refactor: move NLP to analysis dir

fix(backend): broken null timestamp handling
feat(nlp): add Named Entity Recognition to dataset
2026-02-17 18:51:15 +00:00 · 2026-02-17 18:49:03 +00:00 · 2026-02-17 18:48:45 +00:00 · 2026-02-17 18:26:40 +00:00 · 2026-02-17 18:00:16 +00:00 · 2026-02-17 17:40:29 +00:00
12 changed files with 827 additions and 255 deletions
--- a/frontend/package-lock.json
+++ b/frontend/package-lock.json
@@ -15,6 +15,7 @@
        "headlessui": "^0.0.0",
        "react": "^19.2.0",
        "react-dom": "^19.2.0",
        "react-force-graph-3d": "^1.29.1",
        "react-router-dom": "^7.13.0",
        "recharts": "^3.7.0"
      },
@@ -267,6 +268,15 @@
        "@babel/core": "^7.0.0-0"
      }
    },
    "node_modules/@babel/runtime": {
      "version": "7.28.6",
      "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.28.6.tgz",
      "integrity": "sha512-05WQkdpL9COIMz4LjTxGpPNCdlpyimKppYNoJ5Di5EUObifl8t4tuLuUBBZEpoLYOmfvIWrsp9fCl0HoPRVTdA==",
      "license": "MIT",
      "engines": {
        "node": ">=6.9.0"
      }
    },
    "node_modules/@babel/template": {
      "version": "7.28.6",
      "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.28.6.tgz",
@@ -2022,6 +2032,12 @@
        "url": "https://github.com/sponsors/tannerlinsley"
      }
    },
    "node_modules/@tweenjs/tween.js": {
      "version": "25.0.0",
      "resolved": "https://registry.npmjs.org/@tweenjs/tween.js/-/tween.js-25.0.0.tgz",
      "integrity": "sha512-XKLA6syeBUaPzx4j3qwMqzzq+V4uo72BnlbOjmuljLrRqdsd3qnzvZZoxvMHZ23ndsRS4aufU6JOZYpCbU6T1A==",
      "license": "MIT"
    },
    "node_modules/@types/babel__core": {
      "version": "7.20.5",
      "resolved": "https://registry.npmjs.org/@types/babel__core/-/babel__core-7.20.5.tgz",
@@ -2488,6 +2504,31 @@
        "vite": "^4.2.0 || ^5.0.0 || ^6.0.0 || ^7.0.0"
      }
    },
    "node_modules/3d-force-graph": {
      "version": "1.79.1",
      "resolved": "https://registry.npmjs.org/3d-force-graph/-/3d-force-graph-1.79.1.tgz",
      "integrity": "sha512-iscIVt4jWjJ11KEEswgOIOWk8Ew4EFKHRyERJXJ0ouycqzHCtWwb9E5imnxS5rYF1f1IESkFNAfB+h3EkU0Irw==",
      "license": "MIT",
      "dependencies": {
        "accessor-fn": "1",
        "kapsule": "^1.16",
        "three": ">=0.118 <1",
        "three-forcegraph": "1",
        "three-render-objects": "^1.35"
      },
      "engines": {
        "node": ">=12"
      }
    },
    "node_modules/accessor-fn": {
      "version": "1.5.3",
      "resolved": "https://registry.npmjs.org/accessor-fn/-/accessor-fn-1.5.3.tgz",
      "integrity": "sha512-rkAofCwe/FvYFUlMB0v0gWmhqtfAtV1IUkdPbfhTUyYniu5LrC0A0UJkTH0Jv3S8SvwkmfuAlY+mQIJATdocMA==",
      "license": "MIT",
      "engines": {
        "node": ">=12"
      }
    },
    "node_modules/acorn": {
      "version": "8.15.0",
      "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.15.0.tgz",
@@ -2793,6 +2834,12 @@
        "node": ">=12"
      }
    },
    "node_modules/d3-binarytree": {
      "version": "1.0.2",
      "resolved": "https://registry.npmjs.org/d3-binarytree/-/d3-binarytree-1.0.2.tgz",
      "integrity": "sha512-cElUNH+sHu95L04m92pG73t2MEJXKu+GeKUN1TJkFsu93E5W8E9Sc3kHEGJKgenGvj19m6upSn2EunvMgMD2Yw==",
      "license": "MIT"
    },
    "node_modules/d3-cloud": {
      "version": "1.2.8",
      "resolved": "https://registry.npmjs.org/d3-cloud/-/d3-cloud-1.2.8.tgz",
@@ -2826,6 +2873,22 @@
        "node": ">=12"
      }
    },
    "node_modules/d3-force-3d": {
      "version": "3.0.6",
      "resolved": "https://registry.npmjs.org/d3-force-3d/-/d3-force-3d-3.0.6.tgz",
      "integrity": "sha512-4tsKHUPLOVkyfEffZo1v6sFHvGFwAIIjt/W8IThbp08DYAsXZck+2pSHEG5W1+gQgEvFLdZkYvmJAbRM2EzMnA==",
      "license": "MIT",
      "dependencies": {
        "d3-binarytree": "1",
        "d3-dispatch": "1 - 3",
        "d3-octree": "1",
        "d3-quadtree": "1 - 3",
        "d3-timer": "1 - 3"
      },
      "engines": {
        "node": ">=12"
      }
    },
    "node_modules/d3-format": {
      "version": "3.1.2",
      "resolved": "https://registry.npmjs.org/d3-format/-/d3-format-3.1.2.tgz",
@@ -2847,6 +2910,12 @@
        "node": ">=12"
      }
    },
    "node_modules/d3-octree": {
      "version": "1.1.0",
      "resolved": "https://registry.npmjs.org/d3-octree/-/d3-octree-1.1.0.tgz",
      "integrity": "sha512-F8gPlqpP+HwRPMO/8uOu5wjH110+6q4cgJvgJT6vlpy3BEaDIKlTZrgHKZSp/i1InRpVfh4puY/kvL6MxK930A==",
      "license": "MIT"
    },
    "node_modules/d3-path": {
      "version": "3.1.0",
      "resolved": "https://registry.npmjs.org/d3-path/-/d3-path-3.1.0.tgz",
@@ -2856,6 +2925,15 @@
        "node": ">=12"
      }
    },
    "node_modules/d3-quadtree": {
      "version": "3.0.1",
      "resolved": "https://registry.npmjs.org/d3-quadtree/-/d3-quadtree-3.0.1.tgz",
      "integrity": "sha512-04xDrxQTDTCFwP5H6hRhsRcb9xxv2RzkcsygFzmkSIOJy3PeRJP7sNk3VRIbKXcog561P9oU0/rVH6vDROAgUw==",
      "license": "ISC",
      "engines": {
        "node": ">=12"
      }
    },
    "node_modules/d3-scale": {
      "version": "4.0.2",
      "resolved": "https://registry.npmjs.org/d3-scale/-/d3-scale-4.0.2.tgz",
@@ -2958,6 +3036,18 @@
        "d3-selection": "2 - 3"
      }
    },
    "node_modules/data-bind-mapper": {
      "version": "1.0.3",
      "resolved": "https://registry.npmjs.org/data-bind-mapper/-/data-bind-mapper-1.0.3.tgz",
      "integrity": "sha512-QmU3lyEnbENQPo0M1F9BMu4s6cqNNp8iJA+b/HP2sSb7pf3dxwF3+EP1eO69rwBfH9kFJ1apmzrtogAmVt2/Xw==",
      "license": "MIT",
      "dependencies": {
        "accessor-fn": "1"
      },
      "engines": {
        "node": ">=12"
      }
    },
    "node_modules/debug": {
      "version": "4.4.3",
      "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz",
@@ -3419,6 +3509,20 @@
      "dev": true,
      "license": "ISC"
    },
    "node_modules/float-tooltip": {
      "version": "1.7.5",
      "resolved": "https://registry.npmjs.org/float-tooltip/-/float-tooltip-1.7.5.tgz",
      "integrity": "sha512-/kXzuDnnBqyyWyhDMH7+PfP8J/oXiAavGzcRxASOMRHFuReDtofizLLJsf7nnDLAfEaMW4pVWaXrAjtnglpEkg==",
      "license": "MIT",
      "dependencies": {
        "d3-selection": "2 - 3",
        "kapsule": "^1.16",
        "preact": "10"
      },
      "engines": {
        "node": ">=12"
      }
    },
    "node_modules/follow-redirects": {
      "version": "1.15.11",
      "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.11.tgz",
@@ -3722,11 +3826,19 @@
      "dev": true,
      "license": "ISC"
    },
    "node_modules/jerrypick": {
      "version": "1.1.2",
      "resolved": "https://registry.npmjs.org/jerrypick/-/jerrypick-1.1.2.tgz",
      "integrity": "sha512-YKnxXEekXKzhpf7CLYA0A+oDP8V0OhICNCr5lv96FvSsDEmrb0GKM776JgQvHTMjr7DTTPEVv/1Ciaw0uEWzBA==",
      "license": "MIT",
      "engines": {
        "node": ">=12"
      }
    },
    "node_modules/js-tokens": {
      "version": "4.0.0",
      "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz",
      "integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==",
      "dev": true,
      "license": "MIT"
    },
    "node_modules/js-yaml": {
@@ -3789,6 +3901,18 @@
        "node": ">=6"
      }
    },
    "node_modules/kapsule": {
      "version": "1.16.3",
      "resolved": "https://registry.npmjs.org/kapsule/-/kapsule-1.16.3.tgz",
      "integrity": "sha512-4+5mNNf4vZDSwPhKprKwz3330iisPrb08JyMgbsdFrimBCKNHecua/WBwvVg3n7vwx0C1ARjfhwIpbrbd9n5wg==",
      "license": "MIT",
      "dependencies": {
        "lodash-es": "4"
      },
      "engines": {
        "node": ">=12"
      }
    },
    "node_modules/keyv": {
      "version": "4.5.4",
      "resolved": "https://registry.npmjs.org/keyv/-/keyv-4.5.4.tgz",
@@ -3835,6 +3959,12 @@
      "integrity": "sha512-LgVTMpQtIopCi79SJeDiP0TfWi5CNEc/L/aRdTh3yIvmZXTnheWpKjSZhnvMl8iXbC1tFg9gdHHDMLoV7CnG+w==",
      "license": "MIT"
    },
    "node_modules/lodash-es": {
      "version": "4.17.23",
      "resolved": "https://registry.npmjs.org/lodash-es/-/lodash-es-4.17.23.tgz",
      "integrity": "sha512-kVI48u3PZr38HdYz98UmfPnXl2DXrpdctLrFLCd3kOx1xUkOmpFPx7gCWWM5MPkL/fD8zb+Ph0QzjGFs4+hHWg==",
      "license": "MIT"
    },
    "node_modules/lodash.debounce": {
      "version": "4.0.8",
      "resolved": "https://registry.npmjs.org/lodash.debounce/-/lodash.debounce-4.0.8.tgz",
@@ -3848,6 +3978,18 @@
      "dev": true,
      "license": "MIT"
    },
    "node_modules/loose-envify": {
      "version": "1.4.0",
      "resolved": "https://registry.npmjs.org/loose-envify/-/loose-envify-1.4.0.tgz",
      "integrity": "sha512-lyuxPGr/Wfhrlem2CL/UcnUc1zcqKAImBDzukY7Y5F/yQiNdko6+fRLevlw1HgMySw7f611UIY408EtxRSoK3Q==",
      "license": "MIT",
      "dependencies": {
        "js-tokens": "^3.0.0 || ^4.0.0"
      },
      "bin": {
        "loose-envify": "cli.js"
      }
    },
    "node_modules/lru-cache": {
      "version": "5.1.1",
      "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz",
@@ -3934,6 +4076,44 @@
      "dev": true,
      "license": "MIT"
    },
    "node_modules/ngraph.events": {
      "version": "1.4.0",
      "resolved": "https://registry.npmjs.org/ngraph.events/-/ngraph.events-1.4.0.tgz",
      "integrity": "sha512-NeDGI4DSyjBNBRtA86222JoYietsmCXbs8CEB0dZ51Xeh4lhVl1y3wpWLumczvnha8sFQIW4E0vvVWwgmX2mGw==",
      "license": "BSD-3-Clause"
    },
    "node_modules/ngraph.forcelayout": {
      "version": "3.3.1",
      "resolved": "https://registry.npmjs.org/ngraph.forcelayout/-/ngraph.forcelayout-3.3.1.tgz",
      "integrity": "sha512-MKBuEh1wujyQHFTW57y5vd/uuEOK0XfXYxm3lC7kktjJLRdt/KEKEknyOlc6tjXflqBKEuYBBcu7Ax5VY+S6aw==",
      "license": "BSD-3-Clause",
      "dependencies": {
        "ngraph.events": "^1.0.0",
        "ngraph.merge": "^1.0.0",
        "ngraph.random": "^1.0.0"
      }
    },
    "node_modules/ngraph.graph": {
      "version": "20.1.2",
      "resolved": "https://registry.npmjs.org/ngraph.graph/-/ngraph.graph-20.1.2.tgz",
      "integrity": "sha512-W/G3GBR3Y5UxMLHTUCPP9v+pbtpzwuAEIqP5oZV+9IwgxAIEZwh+Foc60iPc1idlnK7Zxu0p3puxAyNmDvBd0Q==",
      "license": "BSD-3-Clause",
      "dependencies": {
        "ngraph.events": "^1.4.0"
      }
    },
    "node_modules/ngraph.merge": {
      "version": "1.0.0",
      "resolved": "https://registry.npmjs.org/ngraph.merge/-/ngraph.merge-1.0.0.tgz",
      "integrity": "sha512-5J8YjGITUJeapsomtTALYsw7rFveYkM+lBj3QiYZ79EymQcuri65Nw3knQtFxQBU1r5iOaVRXrSwMENUPK62Vg==",
      "license": "MIT"
    },
    "node_modules/ngraph.random": {
      "version": "1.2.0",
      "resolved": "https://registry.npmjs.org/ngraph.random/-/ngraph.random-1.2.0.tgz",
      "integrity": "sha512-4EUeAGbB2HWX9njd6bP6tciN6ByJfoaAvmVL9QTaZSeXrW46eNGA9GajiXiPBbvFqxUWFkEbyo6x5qsACUuVfA==",
      "license": "BSD-3-Clause"
    },
    "node_modules/node-releases": {
      "version": "2.0.27",
      "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.27.tgz",
@@ -3941,6 +4121,15 @@
      "dev": true,
      "license": "MIT"
    },
    "node_modules/object-assign": {
      "version": "4.1.1",
      "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz",
      "integrity": "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==",
      "license": "MIT",
      "engines": {
        "node": ">=0.10.0"
      }
    },
    "node_modules/optionator": {
      "version": "0.9.4",
      "resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.4.tgz",
@@ -4044,6 +4233,18 @@
        "url": "https://github.com/sponsors/jonschlinkert"
      }
    },
    "node_modules/polished": {
      "version": "4.3.1",
      "resolved": "https://registry.npmjs.org/polished/-/polished-4.3.1.tgz",
      "integrity": "sha512-OBatVyC/N7SCW/FaDHrSd+vn0o5cS855TOmYi4OkdWUMSJCET/xip//ch8xGUvtr3i44X9LVyWwQlRMTN3pwSA==",
      "license": "MIT",
      "dependencies": {
        "@babel/runtime": "^7.17.8"
      },
      "engines": {
        "node": ">=10"
      }
    },
    "node_modules/postcss": {
      "version": "8.5.6",
      "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.6.tgz",
@@ -4073,6 +4274,16 @@
        "node": "^10 || ^12 || >=14"
      }
    },
    "node_modules/preact": {
      "version": "10.28.3",
      "resolved": "https://registry.npmjs.org/preact/-/preact-10.28.3.tgz",
      "integrity": "sha512-tCmoRkPQLpBeWzpmbhryairGnhW9tKV6c6gr/w+RhoRoKEJwsjzipwp//1oCpGPOchvSLaAPlpcJi9MwMmoPyA==",
      "license": "MIT",
      "funding": {
        "type": "opencollective",
        "url": "https://opencollective.com/preact"
      }
    },
    "node_modules/prelude-ls": {
      "version": "1.2.1",
      "resolved": "https://registry.npmjs.org/prelude-ls/-/prelude-ls-1.2.1.tgz",
@@ -4083,6 +4294,23 @@
        "node": ">= 0.8.0"
      }
    },
    "node_modules/prop-types": {
      "version": "15.8.1",
      "resolved": "https://registry.npmjs.org/prop-types/-/prop-types-15.8.1.tgz",
      "integrity": "sha512-oj87CgZICdulUohogVAR7AjlC0327U4el4L6eAvOqCeudMDVU0NThNaV+b9Df4dXgSP1gXMTnPdhfe/2qDH5cg==",
      "license": "MIT",
      "dependencies": {
        "loose-envify": "^1.4.0",
        "object-assign": "^4.1.1",
        "react-is": "^16.13.1"
      }
    },
    "node_modules/prop-types/node_modules/react-is": {
      "version": "16.13.1",
      "resolved": "https://registry.npmjs.org/react-is/-/react-is-16.13.1.tgz",
      "integrity": "sha512-24e6ynE2H+OKt4kqsOvNd8kBpV65zoxbA4BVsEOB3ARVWQki/DHzaUoC5KuON/BiccDaCCTZBuOcfZs70kR8bQ==",
      "license": "MIT"
    },
    "node_modules/proxy-from-env": {
      "version": "1.1.0",
      "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz",
@@ -4120,6 +4348,23 @@
        "react": "^19.2.4"
      }
    },
    "node_modules/react-force-graph-3d": {
      "version": "1.29.1",
      "resolved": "https://registry.npmjs.org/react-force-graph-3d/-/react-force-graph-3d-1.29.1.tgz",
      "integrity": "sha512-5Vp+PGpYnO+zLwgK2NvNqdXHvsWLrFzpDfJW1vUA1twjo9SPvXqfUYQrnRmAbD+K2tOxkZw1BkbH31l5b4TWHg==",
      "license": "MIT",
      "dependencies": {
        "3d-force-graph": "^1.79",
        "prop-types": "15",
        "react-kapsule": "^2.5"
      },
      "engines": {
        "node": ">=12"
      },
      "peerDependencies": {
        "react": "*"
      }
    },
    "node_modules/react-is": {
      "version": "19.2.4",
      "resolved": "https://registry.npmjs.org/react-is/-/react-is-19.2.4.tgz",
@@ -4127,6 +4372,21 @@
      "license": "MIT",
      "peer": true
    },
    "node_modules/react-kapsule": {
      "version": "2.5.7",
      "resolved": "https://registry.npmjs.org/react-kapsule/-/react-kapsule-2.5.7.tgz",
      "integrity": "sha512-kifAF4ZPD77qZKc4CKLmozq6GY1sBzPEJTIJb0wWFK6HsePJatK3jXplZn2eeAt3x67CDozgi7/rO8fNQ/AL7A==",
      "license": "MIT",
      "dependencies": {
        "jerrypick": "^1.1.1"
      },
      "engines": {
        "node": ">=12"
      },
      "peerDependencies": {
        "react": ">=16.13.1"
      }
    },
    "node_modules/react-redux": {
      "version": "9.2.0",
      "resolved": "https://registry.npmjs.org/react-redux/-/react-redux-9.2.0.tgz",
@@ -4413,12 +4673,67 @@
      "integrity": "sha512-05PUHKSNE8ou2dwIxTngl4EzcnsCDZGJ/iCLtDflR/SHB/ny14rXc+qU5P4mG9JkusiV7EivzY9Mhm55AzAvCg==",
      "license": "MIT"
    },
    "node_modules/three": {
      "version": "0.182.0",
      "resolved": "https://registry.npmjs.org/three/-/three-0.182.0.tgz",
      "integrity": "sha512-GbHabT+Irv+ihI1/f5kIIsZ+Ef9Sl5A1Y7imvS5RQjWgtTPfPnZ43JmlYI7NtCRDK9zir20lQpfg8/9Yd02OvQ==",
      "license": "MIT"
    },
    "node_modules/three-forcegraph": {
      "version": "1.43.1",
      "resolved": "https://registry.npmjs.org/three-forcegraph/-/three-forcegraph-1.43.1.tgz",
      "integrity": "sha512-lQnYPLvR31gb91mF5xHhU0jPHJgBPw9QB23R6poCk8Tgvz8sQtq7wTxwClcPdfKCBbHXsb7FSqK06Osiu1kQ5A==",
      "license": "MIT",
      "dependencies": {
        "accessor-fn": "1",
        "d3-array": "1 - 3",
        "d3-force-3d": "2 - 3",
        "d3-scale": "1 - 4",
        "d3-scale-chromatic": "1 - 3",
        "data-bind-mapper": "1",
        "kapsule": "^1.16",
        "ngraph.forcelayout": "3",
        "ngraph.graph": "20",
        "tinycolor2": "1"
      },
      "engines": {
        "node": ">=12"
      },
      "peerDependencies": {
        "three": ">=0.118.3"
      }
    },
    "node_modules/three-render-objects": {
      "version": "1.40.4",
      "resolved": "https://registry.npmjs.org/three-render-objects/-/three-render-objects-1.40.4.tgz",
      "integrity": "sha512-Ukpu1pei3L5r809izvjsZxwuRcYLiyn6Uvy3lZ9bpMTdvj3i6PeX6w++/hs2ZS3KnEzGjb6YvTvh4UQuwHTDJg==",
      "license": "MIT",
      "dependencies": {
        "@tweenjs/tween.js": "18 - 25",
        "accessor-fn": "1",
        "float-tooltip": "^1.7",
        "kapsule": "^1.16",
        "polished": "4"
      },
      "engines": {
        "node": ">=12"
      },
      "peerDependencies": {
        "three": ">=0.168"
      }
    },
    "node_modules/tiny-invariant": {
      "version": "1.3.3",
      "resolved": "https://registry.npmjs.org/tiny-invariant/-/tiny-invariant-1.3.3.tgz",
      "integrity": "sha512-+FbBPE1o9QAYvviau/qC5SE3caw21q3xkvWKBtja5vgqOWIHHJ3ioaq1VPfn/Szqctz2bU/oYeKd9/z5BL+PVg==",
      "license": "MIT"
    },
    "node_modules/tinycolor2": {
      "version": "1.6.0",
      "resolved": "https://registry.npmjs.org/tinycolor2/-/tinycolor2-1.6.0.tgz",
      "integrity": "sha512-XPaBkWQJdsf3pLKJV9p4qN/S+fm2Oj8AIPo1BTUhg5oxkvm9+SVEGFdhyOz7tTdUTfvxMiAs4sp6/eZO2Ew+pw==",
      "license": "MIT"
    },
    "node_modules/tinyglobby": {
      "version": "0.2.15",
      "resolved": "https://registry.npmjs.org/tinyglobby/-/tinyglobby-0.2.15.tgz",
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -17,6 +17,7 @@
    "headlessui": "^0.0.0",
    "react": "^19.2.0",
    "react-dom": "^19.2.0",
    "react-force-graph-3d": "^1.29.1",
    "react-router-dom": "^7.13.0",
    "recharts": "^3.7.0"
  },
--- a/frontend/src/components/InteractionStats.tsx
+++ b/frontend/src/components/InteractionStats.tsx
@@ -0,0 +1,61 @@
 import { useMemo } from "react";
 import ForceGraph3D from "react-force-graph-3d";
 import {
    type UserAnalysisResponse,
    type InteractionGraph
 } from '../types/ApiTypes';
 import StatsStyling from "../styles/stats_styling";
 const styles = StatsStyling;
 /**
  * Converts the API's nested interaction map into the `{ nodes, links }` object
  * handed to ForceGraph3D.
  *
  * Links are dropped when their count is below `minInteractions` or when either
  * endpoint is the "[deleted]" placeholder. Only users that still take part in
  * at least one remaining link become nodes.
  *
  * @param apiData          outer key = source user, inner key = target user, value = count
  * @param minInteractions  smallest count a link needs to be kept (defaults to 2)
  * @returns nodes (`{ id }`) and links (`{ source, target, value }`)
  */
 function ApiToGraphData(apiData: InteractionGraph, minInteractions = 2) {
    const links: { source: string; target: string; value: number }[] = [];
    for (const [source, targets] of Object.entries(apiData)) {
        if (source === "[deleted]") {
            continue;
        }
        for (const [target, count] of Object.entries(targets)) {
            // drop low-value and deleted interactions to reduce clutter
            if (count >= minInteractions && target !== "[deleted]") {
                links.push({ source, target, value: count });
            }
        }
    }
    const connectedNodeIds = new Set(links.flatMap(link => [link.source, link.target]));
    // Known users first, in the order the API listed them.
    const nodes = Object.keys(apiData)
        .filter(username => connectedNodeIds.has(username))
        .map(username => ({ id: username }));
    // A link target is not guaranteed to be a top-level key of apiData; give any
    // such endpoint a node as well so no link points at a node that does not exist.
    const knownIds = new Set(nodes.map(node => node.id));
    for (const id of connectedNodeIds) {
        if (!knownIds.has(id)) {
            nodes.push({ id });
        }
    }
    return { nodes, links };
 }
 /**
  * Renders the user interaction graph for the stats page.
  *
  * @param props.data  user analysis payload; only `interaction_graph` is read here
  */
 const InteractionStats = (props: { data: UserAnalysisResponse }) => {
  // Memoize the converted graph so ForceGraph3D receives the same object across
  // re-renders; a fresh object on every render is treated as new graph data.
  const graphData = useMemo(
    () => ApiToGraphData(props.data.interaction_graph),
    [props.data.interaction_graph]
  );
  return (
    <div style={styles.page}>
        <h2 style={styles.sectionTitle}>User Interaction Graph</h2>
        <p style={styles.sectionSubtitle}>
            This graph visualizes interactions between users based on comments and replies. 
            Nodes represent users, and edges represent interactions (e.g., comments or replies) between them.
        </p>
        <div style={{ height: "600px", border: "1px solid #ccc", borderRadius: 8, marginTop: 16 }}>
            <ForceGraph3D
                graphData={graphData}
                nodeAutoColorBy="id"
                linkDirectionalParticles={2}
                linkDirectionalParticleSpeed={0.005}
                linkWidth={(link) => Math.sqrt(link.value)}
                nodeLabel={(node) => `${node.id}`}
            />
        </div>
    </div>
  );
 }
 export default InteractionStats;
--- a/frontend/src/pages/Stats.tsx
+++ b/frontend/src/pages/Stats.tsx
@@ -3,6 +3,7 @@ import axios from "axios";
 import StatsStyling from "../styles/stats_styling";
 import SummaryStats from "../components/SummaryStats";
 import EmotionalStats from "../components/EmotionalStats";
 import InteractionStats from "../components/InteractionStats";
 import { 
  type SummaryResponse, 
@@ -16,7 +17,7 @@ const styles = StatsStyling;
 const StatPage = () => {
  const [error, setError] = useState('');
  const [loading, setLoading] = useState(false);
-  const [activeView, setActiveView] = useState<"summary" | "emotional">("summary");
+  const [activeView, setActiveView] = useState<"summary" | "emotional" | "interaction">("summary");
  const [userData, setUserData] = useState<UserAnalysisResponse | null>(null);
  const [timeData, setTimeData] = useState<TimeAnalysisResponse | null>(null);
@@ -133,6 +134,13 @@ return (
      >
        Emotional
      </button>
      <button
        onClick={() => setActiveView("interaction")}
        style={activeView === "interaction" ? styles.buttonPrimary : styles.buttonSecondary}
      >
        Interaction
      </button>
    </div>
    {activeView === "summary" && (
@@ -154,6 +162,10 @@ return (
      </div>
    )}
    {activeView === "interaction" && userData && (
      <InteractionStats data={userData} />
    )}
  </div>
 );
 }
--- a/frontend/src/types/ApiTypes.ts
+++ b/frontend/src/types/ApiTypes.ts
@@ -35,9 +35,12 @@ type User = {
  vocab?: Vocab | null;
 };
 // Nested reply counts: outer key is a username, inner key is a user they
 // replied to, and the value is how many such replies were counted.
 type InteractionGraph = Record<string, Record<string, number>>;
 // User analysis payload consumed by the stats views.
 type UserAnalysisResponse = {
  top_users: TopUser[];
  users: User[];
  // Reply graph rendered by the InteractionStats component.
  interaction_graph: InteractionGraph;
 };
 // Time Analysis
@@ -89,6 +92,7 @@ export type {
    TopUser,
    Vocab,
    User,
    InteractionGraph,
    UserAnalysisResponse,
    FrequencyWord,
    AverageEmotionByTopic,
--- a/server/analysis/emotional.py
+++ b/server/analysis/emotional.py
@@ -0,0 +1,41 @@
 import pandas as pd
 class EmotionalAnalysis:
    """Aggregates per-row emotion scores by topic.

    The wrapped DataFrame is only read. ``avg_emotion_by_topic`` needs a
    ``topic`` column and uses every column whose name starts with ``emotion_``.
    """
    def __init__(self, df: pd.DataFrame):
        self.df = df
    def avg_emotion_by_topic(self) -> list[dict]:
        """Return the mean of each emotion column per topic, with row counts.

        Rows whose topic is "Misc" are ignored, and the ``emotion_neutral`` and
        ``emotion_surprise`` columns are left out of the averages.

        Returns:
            One record per topic: ``{"topic": ..., "<emotion col>": mean, ..., "n": rows}``.
        """
        emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
        emotion_cols = [
            col for col in self.df.columns
            if col.startswith("emotion_") and col not in emotion_exclusions
        ]
        # Filter and group once; the size and the means share the same groups.
        by_topic = self.df[self.df["topic"] != "Misc"].groupby("topic")
        counts = by_topic.size().rename("n")
        means = by_topic[emotion_cols].mean().reset_index()
        return means.merge(counts, on="topic").to_dict(orient="records")
--- a/server/analysis/interactional.py
+++ b/server/analysis/interactional.py
@@ -0,0 +1,126 @@
 import pandas as pd
 import re
 from collections import Counter
 class InteractionAnalysis:
    """Per-author activity, vocabulary and reply-graph statistics.

    Methods read these DataFrame columns: ``author``, ``content``, ``source``,
    ``type``, ``id`` and ``reply_to``. ``word_exclusions`` holds words that are
    left out of the vocabulary statistics.
    """
    def __init__(self, df: pd.DataFrame, word_exclusions: set[str]):
        self.df = df
        self.word_exclusions = word_exclusions
    def _tokenize(self, text: str):
        """Return the non-excluded words of ``text``.

        A word is a run of three or more lowercase a-z letters, so callers are
        expected to lowercase the text first.
        """
        tokens = re.findall(r"\b[a-z]{3,}\b", text)
        return [t for t in tokens if t not in self.word_exclusions]
    def _vocab_richness_per_user(self, min_words: int = 20, top_most_used_words: int = 100) -> list:
        """Compute vocabulary statistics for each author.

        Args:
            min_words: authors with fewer counted words than this are skipped.
            top_most_used_words: how many of the author's most common words to report.

        Returns:
            A list of per-author dicts, sorted by ``vocab_richness`` descending.
        """
        # Work on a copy so the lowercased content and token column never leak into self.df.
        df = self.df.copy()
        df["content"] = df["content"].fillna("").astype(str).str.lower()
        df["tokens"] = df["content"].apply(self._tokenize)
        rows = []
        for author, group in df.groupby("author"):
            all_tokens = [t for tokens in group["tokens"] for t in tokens]
            total_words = len(all_tokens)
            unique_words = len(set(all_tokens))
            events = len(group)
            # Min amount of words for a user, any less than this might give weird results
            if total_words < min_words:
                continue
            # 100% = they never reused a word (excluding stop words)
            vocab_richness = unique_words / total_words
            avg_words = total_words / max(events, 1)
            counts = Counter(all_tokens)
            top_words = [
                {"word": w, "count": int(c)}
                for w, c in counts.most_common(top_most_used_words)
            ]
            rows.append({
                "author": author,
                "events": int(events),
                "total_words": int(total_words),
                "unique_words": int(unique_words),
                "vocab_richness": round(vocab_richness, 3),
                "avg_words_per_event": round(avg_words, 2),
                "top_words": top_words
            })
        rows = sorted(rows, key=lambda x: x["vocab_richness"], reverse=True)
        return rows
    def top_users(self) -> list:
        """Return event counts per (author, source) pair, largest first."""
        counts = (
            self.df.groupby(["author", "source"])
            .size()
            .sort_values(ascending=False)
        )
        top_users = [
            {"author": author, "source": source, "count": int(count)}
            for (author, source), count in counts.items()
        ]
        return top_users
    def per_user_analysis(self) -> list:
        """Return post/comment statistics per author, merged with vocabulary stats.

        Returns:
            A list of dicts sorted by ``comment_post_ratio`` ascending. ``vocab``
            is ``None`` for authors skipped by ``_vocab_richness_per_user``.
        """
        per_user = (
            self.df.groupby(["author", "type"])
            .size()
            .unstack(fill_value=0)
        )
        # ensure columns always exist
        for col in ("post", "comment"):
            if col not in per_user.columns:
                per_user[col] = 0
        # A zero denominator is replaced by 1 so the ratios never divide by zero.
        per_user["comment_post_ratio"] = per_user["comment"] / per_user["post"].replace(0, 1)
        per_user["comment_share"] = per_user["comment"] / (per_user["post"] + per_user["comment"]).replace(0, 1)
        per_user = per_user.sort_values("comment_post_ratio", ascending=True)
        per_user_records = per_user.reset_index().to_dict(orient="records")
        vocab_rows = self._vocab_richness_per_user()
        vocab_by_author = {row["author"]: row for row in vocab_rows}
        # merge vocab richness + per_user information
        merged_users = []
        for row in per_user_records:
            author = row["author"]
            merged_users.append({
                "author": author,
                "post": int(row.get("post", 0)),
                "comment": int(row.get("comment", 0)),
                "comment_post_ratio": float(row.get("comment_post_ratio", 0)),
                "comment_share": float(row.get("comment_share", 0)),
                "vocab": vocab_by_author.get(author)
            })
        merged_users.sort(key=lambda u: u["comment_post_ratio"])
        return merged_users
    def interaction_graph(self):
        """Count how often each author replied to each other author.

        Returns:
            ``{author: {replied_to_author: count}}``. Every non-null author has
            an entry, possibly empty. Self-replies, replies to unknown ids and
            replies to events without an author are not counted.
        """
        interactions = {a: {} for a in self.df["author"].dropna().unique()}
        # reply_to refers to the comment id, this allows us to map comment ids to usernames
        id_to_author = self.df.set_index("id")["author"].to_dict()
        for a, reply_id in zip(self.df["author"], self.df["reply_to"]):
            if pd.isna(a) or pd.isna(reply_id) or reply_id == "":
                continue
            b = id_to_author.get(reply_id)
            # The replied-to event can exist but have a missing (NaN) author;
            # without the isna check that would create a NaN key in the graph.
            if b is None or pd.isna(b) or a == b:
                continue
            interactions[a][b] = interactions[a].get(b, 0) + 1
        return interactions
--- a/server/analysis/linguistic.py
+++ b/server/analysis/linguistic.py
@@ -0,0 +1,68 @@
 import pandas as pd
 import re
 from collections import Counter
 from itertools import islice
 class LinguisticAnalysis:
    """Word and n-gram frequency statistics over the ``content`` column.

    ``word_exclusions`` holds words that ``word_frequencies`` leaves out.
    """
    def __init__(self, df: pd.DataFrame, word_exclusions: set[str]):
        self.df = df
        self.word_exclusions = word_exclusions
    def _clean_text(self, text: str) -> str:
        """Strip URLs, HTML entities, stray "amp" and image file names from ``text``.

        Matching ignores case because ``ngrams`` calls this before lowercasing;
        a case-sensitive match would let e.g. "HTTPS://..." or "PHOTO.JPG" through.
        """
        text = re.sub(r"http\S+", "", text, flags=re.IGNORECASE)        # remove URLs
        text = re.sub(r"www\S+", "", text, flags=re.IGNORECASE)
        text = re.sub(r"&\w+;", "", text, flags=re.IGNORECASE)          # remove HTML entities
        text = re.sub(r"\bamp\b", "", text, flags=re.IGNORECASE)        # remove stray amp
        text = re.sub(r"\S+\.(jpg|jpeg|png|webp|gif)", "", text, flags=re.IGNORECASE)
        return text
    def word_frequencies(self, limit: int = 100) -> list[dict]:
        """Return the ``limit`` most frequent non-excluded words.

        A word is a run of three or more a-z letters in the lowercased content.

        Returns:
            Records of the form ``{"word": ..., "count": ...}``, most frequent first.
        """
        texts = (
            self.df["content"]
            .dropna()
            .astype(str)
            .str.lower()
        )
        words = []
        for text in texts:
            tokens = re.findall(r"\b[a-z]{3,}\b", text)
            words.extend(
                w for w in tokens
                if w not in self.word_exclusions
            )
        counts = Counter(words)
        word_frequencies = (
            pd.DataFrame(counts.items(), columns=["word", "count"])
            .sort_values("count", ascending=False)
            .head(limit)
            .reset_index(drop=True)
        )
        return word_frequencies.to_dict(orient="records")
    def ngrams(self, n=2, limit=100):
        """Return the ``limit`` most frequent ``n``-word sequences.

        Content is cleaned with ``_clean_text`` and lowercased first. N-grams
        never span two rows, and excluded words are deliberately kept.

        Returns:
            Records of the form ``{"ngram": "word word", "count": ...}``, most frequent first.
        """
        texts = self.df["content"].dropna().astype(str).apply(self._clean_text).str.lower()
        all_ngrams = []
        for text in texts:
            tokens = re.findall(r"\b[a-z]{3,}\b", text)
            # stop word removal causes strange behaviors in ngrams
            #tokens = [w for w in tokens if w not in self.word_exclusions]
            # zip over n staggered views of the tokens yields each window of n consecutive words
            ngrams = zip(*(islice(tokens, i, None) for i in range(n)))
            all_ngrams.extend([" ".join(ng) for ng in ngrams])
        counts = Counter(all_ngrams)
        return (
            pd.DataFrame(counts.items(), columns=["ngram", "count"])
            .sort_values("count", ascending=False)
            .head(limit)
            .to_dict(orient="records")
        )
--- a/server/analysis/nlp.py
+++ b/server/analysis/nlp.py
@@ -9,6 +9,7 @@ from sentence_transformers import SentenceTransformer
 class NLP:
    _topic_models: dict[str, SentenceTransformer] = {}
    _emotion_classifiers: dict[str, Any] = {}
    _entity_recognizers: dict[str, Any] = {}
    _topic_embedding_cache: dict[tuple[str, ...], np.ndarray] = {}
    def __init__(
@@ -29,6 +30,9 @@ class NLP:
            self.emotion_classifier = self._get_emotion_classifier(
                self.device_str, self.pipeline_device
            )
            self.entity_recognizer = self._get_entity_recognizer(
                self.device_str, self.pipeline_device
            )           
        except RuntimeError as exc:
            if self.use_cuda and "out of memory" in str(exc).lower():
                torch.cuda.empty_cache()
@@ -87,6 +91,27 @@ class NLP:
            cls._emotion_classifiers[device_str] = classifier
        return classifier
    @classmethod
    def _get_entity_recognizer(cls, device_str: str, pipeline_device: int) -> Any:
        """Return the token-classification pipeline for ``device_str``.

        The pipeline is built on first use and then stored in the class-level
        ``_entity_recognizers`` dict, keyed by ``device_str``.
        """
        cached = cls._entity_recognizers.get(device_str)
        if cached is not None:
            return cached
        ner_kwargs = {
            "aggregation_strategy": "simple",  # merges subwords
            "device": pipeline_device,
        }
        # half precision is only requested on CUDA
        if device_str == "cuda":
            ner_kwargs["dtype"] = torch.float16
        built = pipeline(
            "token-classification",
            model="dslim/bert-base-NER",
            **ner_kwargs,
        )
        cls._entity_recognizers[device_str] = built
        return built
    def _encode_with_backoff(
        self, texts: list[str], initial_batch_size: int
    ) -> np.ndarray:
@@ -129,6 +154,26 @@ class NLP:
                    continue
                raise
    def _infer_entities_with_backoff(
        self, texts: list[str], initial_batch_size: int
    ) -> list[list[dict[str, Any]]]:
        batch_size = initial_batch_size
        while True:
            try:
                return self.entity_recognizer(texts, batch_size=batch_size)
            except RuntimeError as exc:
                if (
                    self.use_cuda
                    and "out of memory" in str(exc).lower()
                    and batch_size > 4
                ):
                    batch_size = max(4, batch_size // 2)
                    torch.cuda.empty_cache()
                    continue
                raise
    def add_emotion_cols(self) -> None:
        texts = self.df[self.content_col].astype(str).str.slice(0, 512).tolist()
@@ -183,3 +228,51 @@ class NLP:
        self.df.loc[self.df["topic_confidence"] < confidence_threshold, "topic"] = (
            "Misc"
        )
    def add_ner_cols(self, max_chars: int = 512) -> None:
        """Attach named-entity columns to ``self.df``.

        Adds three kinds of columns, all derived from ``self.content_col``:

        * ``entities``: per row, a list of ``{"text", "label"}`` dicts.
        * ``entity_counts``: per row, a dict mapping label to occurrence count.
        * ``entity_<label>``: one integer column per label seen anywhere in the
          frame, created in sorted label order so the column layout does not
          depend on set iteration order.

        Args:
            max_chars: Each text is cut to this many characters before it is
                handed to the recognizer.

        Returns:
            None. ``self.df`` is modified in place.
        """
        texts = (
            self.df[self.content_col]
            .fillna("")
            .astype(str)
            .str.slice(0, max_chars)
            .tolist()
        )
        if not texts:
            # Empty frame: still create the two base columns, but no labels are
            # known so no per-label columns can be added.
            self.df["entities"] = []
            self.df["entity_counts"] = []
            return

        results = self._infer_entities_with_backoff(texts, 32 if self.use_cuda else 8)

        entity_lists: list[list[dict[str, str]]] = []
        entity_count_dicts: list[dict[str, int]] = []
        for row in results:
            entities = []
            counts: dict[str, int] = {}
            for ent in row:
                word = ent.get("word")
                label = ent.get("entity_group")
                # Only keep entries where both fields are real strings.
                if isinstance(word, str) and isinstance(label, str):
                    entities.append({"text": word, "label": label})
                    counts[label] = counts.get(label, 0) + 1
            entity_lists.append(entities)
            entity_count_dicts.append(counts)

        self.df["entities"] = entity_lists
        self.df["entity_counts"] = entity_count_dicts

        # Expand label counts into columns. Sorting makes the column order
        # stable across runs (plain set iteration order varies with the
        # interpreter's string hash seed).
        all_labels = sorted({label for d in entity_count_dicts for label in d})
        for label in all_labels:
            self.df[f"entity_{label}"] = [
                d.get(label, 0) for d in entity_count_dicts
            ]
--- a/server/analysis/temporal.py
+++ b/server/analysis/temporal.py
@@ -0,0 +1,70 @@
 import pandas as pd
 class TemporalAnalysis:
    """Time-based aggregations over a frame of posts and comments.

    The methods read the columns ``type``, ``reply_to``, ``id``, ``dt``,
    ``date``, ``weekday``, ``hour`` and any ``emotion_*`` columns from the
    frame given to the constructor. None of the methods modify that frame.
    """

    def __init__(self, df: pd.DataFrame):
        """Keep a reference to ``df``; no copy is made."""
        self.df = df

    def avg_reply_time_per_emotion(self) -> list[dict]:
        """Average reply delay, grouped by each reply's strongest emotion.

        A reply is a row with ``type == "comment"`` and a non-null, non-empty
        ``reply_to``. Its delay is ``dt`` minus the ``dt`` of the row whose
        ``id`` equals ``reply_to``, in seconds; replies whose parent is not in
        the frame get no delay. The strongest emotion is the ``emotion_*``
        column with the largest value, ignoring ``emotion_neutral`` and
        ``emotion_surprise``.

        Returns:
            A list of records with keys ``dominant_emotion``, ``mean`` and
            ``count``; an empty list when the frame contains no replies.
        """
        df = self.df
        is_reply = (
            (df["type"] == "comment") &
            (df["reply_to"].notna()) &
            (df["reply_to"] != "")
        )
        # Copy so the new columns are written to an independent frame rather
        # than to a slice of the shared one.
        replies = df[is_reply].copy()
        if replies.empty:
            # apply(axis=1) on an empty frame yields a DataFrame, which cannot
            # be assigned to a single column, so answer directly.
            return []

        id_to_time = df.set_index("id")["dt"].to_dict()

        def compute_reply_time(row):
            parent_time = id_to_time.get(row["reply_to"])
            if parent_time is None:
                return None
            return (row["dt"] - parent_time).total_seconds()

        replies["reply_time"] = replies.apply(compute_reply_time, axis=1)

        emotion_cols = [
            col for col in df.columns
            if col.startswith("emotion_")
            and col not in ("emotion_neutral", "emotion_surprise")
        ]
        replies["dominant_emotion"] = replies[emotion_cols].idxmax(axis=1)

        grouped = (
            replies
            .groupby("dominant_emotion")["reply_time"]
            .agg(["mean", "count"])
            .reset_index()
        )
        return grouped.to_dict(orient="records")

    def posts_per_day(self) -> list[dict]:
        """Count rows per ``date`` value.

        Returns:
            A list of ``{"date": ..., "count": ...}`` records.
        """
        per_day = (
            self.df.groupby("date")
            .size()
            .reset_index(name="count")
        )
        return per_day.to_dict(orient="records")

    def heatmap(self) -> list[dict]:
        """Count rows per weekday and hour of day.

        Returns:
            One record per weekday that occurs in the data, ordered Monday to
            Sunday, each mapping the hour as a string (``"0"`` .. ``"23"``) to
            the number of rows in that slot. The weekday itself is not part of
            the record.
        """
        weekday_order = [
            "Monday", "Tuesday", "Wednesday",
            "Thursday", "Friday", "Saturday", "Sunday"
        ]
        # Build the ordered categorical on a small local frame so the shared
        # frame's weekday column keeps its original dtype.
        slots = pd.DataFrame({
            "weekday": pd.Categorical(
                self.df["weekday"],
                categories=weekday_order,
                ordered=True
            ),
            "hour": self.df["hour"].to_numpy(),
        })

        heatmap = (
            slots
            .groupby(["weekday", "hour"], observed=True)
            .size()
            .unstack(fill_value=0)
            .reindex(columns=range(24), fill_value=0)
        )
        heatmap.columns = heatmap.columns.map(str)
        return heatmap.to_dict(orient="records")
--- a/server/app.py
+++ b/server/app.py
@@ -12,7 +12,7 @@ app = Flask(__name__)
 CORS(app, resources={r"/*": {"origins": "http://localhost:5173"}})
 # Global State
-posts_df = pd.read_json('posts.jsonl', lines=True)
+posts_df = pd.read_json('small.jsonl', lines=True)
 with open("topic_buckets.json", "r", encoding="utf-8") as f:
    domain_topics = json.load(f)
 stat_obj = StatGen(posts_df, domain_topics)
@@ -47,7 +47,7 @@ def get_dataset():
    if stat_obj is None:
        return jsonify({"error": "No data uploaded"}), 400
-    return jsonify(stat_obj.df.to_dict(orient="records")), 200
+    return stat_obj.df.to_json(orient="records"), 200, {"Content-Type": "application/json"}
@app.route('/stats/content', methods=['GET'])
 def word_frequencies():
--- a/server/stat_gen.py
+++ b/server/stat_gen.py
@@ -1,11 +1,13 @@
 import pandas as pd
 import re
 import nltk
 import datetime
 import nltk
 from nltk.corpus import stopwords
-from collections import Counter
+from server.analysis.nlp import NLP
-from server.nlp import NLP
+from server.analysis.temporal import TemporalAnalysis
 from server.analysis.emotional import EmotionalAnalysis
 from server.analysis.interactional import InteractionAnalysis
 from server.analysis.linguistic import LinguisticAnalysis
 DOMAIN_STOPWORDS = {
    "www", "https", "http",
@@ -23,6 +25,7 @@ EXCLUDE_WORDS = set(stopwords.words('english')) | DOMAIN_STOPWORDS
 class StatGen:
    def __init__(self, df: pd.DataFrame, domain_topics: dict) -> None:
        comments_df = df[["id", "comments"]].explode("comments")
        comments_df = comments_df[comments_df["comments"].apply(lambda x: isinstance(x, dict))]
        comments_df = pd.json_normalize(comments_df["comments"])
        posts_df = df.drop(columns=["comments"])
@@ -35,9 +38,15 @@ class StatGen:
        self.df = pd.concat([posts_df, comments_df])
        self.df.drop(columns=["post_id"], inplace=True, errors="ignore")
        self.nlp = NLP(self.df, "title", "content", domain_topics)
        self._add_extra_cols(self.df)
        self.temporal_analysis = TemporalAnalysis(self.df)
        self.emotional_analysis = EmotionalAnalysis(self.df)
        self.interaction_analysis = InteractionAnalysis(self.df, EXCLUDE_WORDS)
        self.linguistic_analysis = LinguisticAnalysis(self.df, EXCLUDE_WORDS)
        self.original_df = self.df.copy(deep=True)
    ## Private Methods
@@ -50,141 +59,29 @@ class StatGen:
        self.nlp.add_emotion_cols()
        self.nlp.add_topic_col()
-
+        self.nlp.add_ner_cols()
    def _tokenize(self, text: str):
        """Split ``text`` into word tokens, dropping excluded words.

        A token is a run of at least three characters from ``a``-``z`` bounded
        by word boundaries; upper-case letters are not matched. Tokens that are
        members of ``EXCLUDE_WORDS`` are removed.
        """
        return [
            candidate
            for candidate in re.findall(r"\b[a-z]{3,}\b", text)
            if candidate not in EXCLUDE_WORDS
        ]
    def _vocab_richness_per_user(self, min_words: int = 20, top_most_used_words: int = 100) -> list:
        df = self.df.copy()
        df["content"] = df["content"].fillna("").astype(str).str.lower()
        df["tokens"] = df["content"].apply(self._tokenize)
        rows = []
        for author, group in df.groupby("author"):
            all_tokens = [t for tokens in group["tokens"] for t in tokens]
            total_words = len(all_tokens)
            unique_words = len(set(all_tokens))
            events = len(group)
            # Min amount of words for a user, any less than this might give weird results
            if total_words < min_words:
                continue
            # 100% = they never reused a word (excluding stop words)
            vocab_richness = unique_words / total_words
            avg_words = total_words / max(events, 1)
            counts = Counter(all_tokens)
            top_words = [
                {"word": w, "count": int(c)}
                for w, c in counts.most_common(top_most_used_words)
            ]
            rows.append({
                "author": author,
                "events": int(events),
                "total_words": int(total_words),
                "unique_words": int(unique_words),
                "vocab_richness": round(vocab_richness, 3),
                "avg_words_per_event": round(avg_words, 2),
                "top_words": top_words
            })
        rows = sorted(rows, key=lambda x: x["vocab_richness"], reverse=True)
        return rows
    def _interaction_graph(self):
        interactions = {a: {} for a in self.df["author"].dropna().unique()}
        # reply_to refers to the comment id, this allows us to map comment ids to usernames
        id_to_author = self.df.set_index("id")["author"].to_dict()
        for _, row in self.df.iterrows():
            a = row["author"]
            reply_id = row["reply_to"]
            if pd.isna(a) or pd.isna(reply_id) or reply_id == "":
                continue
            b = id_to_author.get(reply_id)
            if b is None or a == b:
                continue
            interactions[a][b] = interactions[a].get(b, 0) + 1
        return interactions
    def _avg_reply_time_per_emotion(self):
        df = self.df.copy()
        replies = df[
            (df["type"] == "comment") &
            (df["reply_to"].notna()) &
            (df["reply_to"] != "")
        ]
        id_to_time = df.set_index("id")["dt"].to_dict()
        def compute_reply_time(row):
            reply_id = row["reply_to"]
            parent_time = id_to_time.get(reply_id)
            if parent_time is None:
                return None
            return (row["dt"] - parent_time).total_seconds()
        replies["reply_time"] = replies.apply(compute_reply_time, axis=1)
        emotion_cols = [col for col in df.columns if col.startswith("emotion_") and col not in ("emotion_neutral", "emotion_surprise")]
        replies["dominant_emotion"] = replies[emotion_cols].idxmax(axis=1)
        grouped = (
            replies
            .groupby("dominant_emotion")["reply_time"]
            .agg(["mean", "count"])
            .reset_index()
        )
        return grouped.to_dict(orient="records")
    ## Public
    def time_analysis(self) -> pd.DataFrame:
        per_day = (
            self.df.groupby("date")
            .size()
            .reset_index(name="count")
        )
        weekday_order = [
            "Monday", "Tuesday", "Wednesday",
            "Thursday", "Friday", "Saturday", "Sunday"
        ]
        self.df["weekday"] = pd.Categorical(
            self.df["weekday"],
            categories=weekday_order,
            ordered=True
        )
        heatmap = (
            self.df
            .groupby(["weekday", "hour"], observed=True)
            .size()
            .unstack(fill_value=0)
            .reindex(columns=range(24), fill_value=0)
        )
        heatmap.columns = heatmap.columns.map(str)
        burst_index = per_day["count"].std() / max(per_day["count"].mean(), 1)
        return {
-            "events_per_day": per_day.to_dict(orient="records"),
+            "events_per_day": self.temporal_analysis.posts_per_day(),
-            "weekday_hour_heatmap": heatmap.to_dict(orient="records"),
+            "weekday_hour_heatmap": self.temporal_analysis.heatmap()
-            "burstiness": round(burst_index, 2)
+        }
    def content_analysis(self) -> dict:
        """Collect the content-related statistics into one dict.

        Delegates to ``self.linguistic_analysis``, ``self.emotional_analysis``
        and ``self.temporal_analysis``; nothing is computed here.

        Returns:
            A dict with the keys ``word_frequencies``, ``common_two_phrases``,
            ``common_three_phrases``, ``average_emotion_by_topic`` and
            ``reply_time_by_emotion``.
        """
        linguistic = self.linguistic_analysis

        report = {}
        report["word_frequencies"] = linguistic.word_frequencies()
        report["common_two_phrases"] = linguistic.ngrams()
        report["common_three_phrases"] = linguistic.ngrams(n=3)
        report["average_emotion_by_topic"] = self.emotional_analysis.avg_emotion_by_topic()
        report["reply_time_by_emotion"] = self.temporal_analysis.avg_reply_time_per_emotion()
        return report
    def user_analysis(self) -> dict:
        """Collect the user-related statistics into one dict.

        Every value comes from ``self.interaction_analysis``.

        Returns:
            A dict with the keys ``top_users``, ``users`` and
            ``interaction_graph``.
        """
        interaction = self.interaction_analysis

        report = {}
        report["top_users"] = interaction.top_users()
        report["users"] = interaction.per_user_analysis()
        report["interaction_graph"] = interaction.interaction_graph()
        return report
    def summary(self) -> dict:
@@ -207,122 +104,6 @@ class StatGen:
            "sources": self.df["source"].dropna().unique().tolist()
        }
    def content_analysis(self, limit: int = 100) -> dict:
        """Compute word frequencies and per-topic emotion averages inline.

        Args:
            limit: Maximum number of rows kept in the word-frequency table.

        Returns:
            A dict with ``word_frequencies`` (records of ``word``/``count``),
            ``average_emotion_by_topic`` (one record per topic other than
            "Misc", holding the mean of each selected emotion column plus the
            row count ``n``) and ``reply_time_by_emotion`` (the result of
            ``self._avg_reply_time_per_emotion()``).
        """
        # Non-null content, as lower-cased strings.
        texts = (
            self.df["content"]
            .dropna()
            .astype(str)
            .str.lower()
        )
        words = []
        for text in texts:
            # Tokens are runs of three or more a-z letters on word boundaries.
            tokens = re.findall(r"\b[a-z]{3,}\b", text)
            words.extend(
                w for w in tokens
                if w not in EXCLUDE_WORDS
            )
        counts = Counter(words)
        # Most frequent words first, cut to `limit` rows.
        word_frequencies = (
            pd.DataFrame(counts.items(), columns=["word", "count"])
            .sort_values("count", ascending=False)
            .head(limit)
            .reset_index(drop=True)
        )
        # These two emotion columns are left out of the averages.
        emotion_exclusions = [
            "emotion_neutral",
            "emotion_surprise"
        ]
        emotion_cols = [
            col for col in self.df.columns
            if col.startswith("emotion_") and col not in emotion_exclusions
        ]
        # NOTE: `counts` is rebound here; from this point it is the number of
        # rows per topic (named "n"), no longer the word Counter.
        counts = (
            self.df[
                (self.df["topic"] != "Misc")
            ]
            .groupby("topic")
            .size()
            .rename("n")
        )
        # Mean of each selected emotion column per topic, "Misc" excluded.
        avg_emotion_by_topic = (
            self.df[
                (self.df["topic"] != "Misc")
            ]
            .groupby("topic")[emotion_cols]
            .mean()
            .reset_index()
        )
        # Attach the per-topic row count to the averages.
        avg_emotion_by_topic = avg_emotion_by_topic.merge(
            counts,
            on="topic"
        )
        return {
            "word_frequencies": word_frequencies.to_dict(orient='records'),
            "average_emotion_by_topic": avg_emotion_by_topic.to_dict(orient='records'),
            "reply_time_by_emotion": self._avg_reply_time_per_emotion()
        }
    def user_analysis(self) -> dict:
        """Compute per-user activity statistics inline.

        Returns:
            A dict with ``top_users`` (row counts per author/source pair,
            largest first), ``users`` (per-author post and comment counts,
            two derived ratios and the author's vocabulary summary, ordered by
            ascending ``comment_post_ratio``) and ``interaction_graph`` (the
            result of ``self._interaction_graph()``).
        """
        # Number of rows per (author, source), largest first.
        counts = (
            self.df.groupby(["author", "source"])
            .size()
            .sort_values(ascending=False)
        )
        top_users = [
            {"author": author, "source": source, "count": int(count)}
            for (author, source), count in counts.items()
        ]
        # One row per author with a count column per value of `type`.
        per_user = (
            self.df.groupby(["author", "type"])
            .size()
            .unstack(fill_value=0)
        )
        # ensure columns always exist
        for col in ("post", "comment"):
            if col not in per_user.columns:
                per_user[col] = 0
        # A zero denominator is replaced by 1 so the divisions never divide by zero.
        per_user["comment_post_ratio"] = per_user["comment"] / per_user["post"].replace(0, 1)
        per_user["comment_share"] = per_user["comment"] / (per_user["post"] + per_user["comment"]).replace(0, 1)
        per_user = per_user.sort_values("comment_post_ratio", ascending=True)
        per_user_records = per_user.reset_index().to_dict(orient="records")
        vocab_rows = self._vocab_richness_per_user()
        vocab_by_author = {row["author"]: row for row in vocab_rows}
        # merge vocab richness + per_user information
        merged_users = []
        for row in per_user_records:
            author = row["author"]
            merged_users.append({
                "author": author,
                "post": int(row.get("post", 0)),
                "comment": int(row.get("comment", 0)),
                "comment_post_ratio": float(row.get("comment_post_ratio", 0)),
                "comment_share": float(row.get("comment_share", 0)),
                # None when the author has no entry in the vocabulary rows.
                "vocab": vocab_by_author.get(author)
            })
        merged_users.sort(key=lambda u: u["comment_post_ratio"])
        return {
            "top_users": top_users,
            "users": merged_users,
            "interaction_graph": self._interaction_graph()
        }
    def search(self, search_query: str) -> dict:
        self.df = self.df[
            self.df["content"].str.contains(search_query)
Author	SHA1	Message	Date
Dylan De Faoite	c11b4bb85b	refactor: move NLP to analysis dir	2026-02-17 18:51:15 +00:00
Dylan De Faoite	289f4254db	fix(backend): broken null timestamp handling	2026-02-17 18:49:03 +00:00
Dylan De Faoite	ed0dd8cdbc	feat(nlp): add Named Entity Recognition to dataset	2026-02-17 18:48:45 +00:00
Dylan De Faoite	8fbf32b67c	feat(linguistic): add most common 2, 3 length n-grams	2026-02-17 18:26:40 +00:00
Dylan De Faoite	d27ba3fca4	refactor: extract interaction and linguistic analysis into dedicated classes	2026-02-17 18:00:16 +00:00
Dylan De Faoite	83010aee55	refactor: extract emotional analysis out of stat_gen	2026-02-17 17:40:29 +00:00
Dylan De Faoite	70b34036db	refactor: extract temporal analysis into it's own class	2026-02-17 17:35:28 +00:00
Dylan De Faoite	563212c98e	perf(frontend): add filter for low interaction graphs & deleted users	2026-02-16 17:09:22 +00:00
Dylan De Faoite	4f577abd4f	feat(frontend): add 3d interaction graph	2026-02-16 17:03:51 +00:00
Dylan De Faoite	7c1e069152	fix(backend): comment parsing didn't account of NaN values	2026-02-16 16:41:16 +00:00