Compare commits

...

15 Commits

Author SHA1 Message Date
257eb80de7 feat(api): add average thread length per emotion 2026-02-23 19:09:48 +00:00
3a23b1f0c8 feat(api): add average thread depth 2026-02-23 18:14:34 +00:00
8c76476cd3 fix(api): broken analysis calls due to overlap in attribute and method names 2026-02-23 18:14:24 +00:00
397986dc89 refactor(frontend): rename InteractionStats to UserStats 2026-02-23 17:15:14 +00:00
04b7094036 feat(api): add cultural endpoint 2026-02-23 17:14:12 +00:00
c11b4bb85b refactor: move NLP to analysis dir 2026-02-17 18:51:15 +00:00
289f4254db fix(backend): broken null timestamp handling 2026-02-17 18:49:03 +00:00
ed0dd8cdbc feat(nlp): add Named Entity Recognition to dataset 2026-02-17 18:48:45 +00:00
8fbf32b67c feat(linguistic): add most common 2, 3 length n-grams 2026-02-17 18:26:40 +00:00
d27ba3fca4 refactor: extract interaction and linguistic analysis into dedicated classes 2026-02-17 18:00:16 +00:00
83010aee55 refactor: extract emotional analysis out of stat_gen 2026-02-17 17:40:29 +00:00
70b34036db refactor: extract temporal analysis into its own class 2026-02-17 17:35:28 +00:00
563212c98e perf(frontend): add filter for low interaction graphs & deleted users 2026-02-16 17:09:22 +00:00
4f577abd4f feat(frontend): add 3d interaction graph 2026-02-16 17:03:51 +00:00
7c1e069152 fix(backend): comment parsing didn't account for NaN values 2026-02-16 16:41:16 +00:00
12 changed files with 1005 additions and 257 deletions

View File

@@ -15,6 +15,7 @@
"headlessui": "^0.0.0",
"react": "^19.2.0",
"react-dom": "^19.2.0",
"react-force-graph-3d": "^1.29.1",
"react-router-dom": "^7.13.0",
"recharts": "^3.7.0"
},
@@ -267,6 +268,15 @@
"@babel/core": "^7.0.0-0"
}
},
"node_modules/@babel/runtime": {
"version": "7.28.6",
"resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.28.6.tgz",
"integrity": "sha512-05WQkdpL9COIMz4LjTxGpPNCdlpyimKppYNoJ5Di5EUObifl8t4tuLuUBBZEpoLYOmfvIWrsp9fCl0HoPRVTdA==",
"license": "MIT",
"engines": {
"node": ">=6.9.0"
}
},
"node_modules/@babel/template": {
"version": "7.28.6",
"resolved": "https://registry.npmjs.org/@babel/template/-/template-7.28.6.tgz",
@@ -2022,6 +2032,12 @@
"url": "https://github.com/sponsors/tannerlinsley"
}
},
"node_modules/@tweenjs/tween.js": {
"version": "25.0.0",
"resolved": "https://registry.npmjs.org/@tweenjs/tween.js/-/tween.js-25.0.0.tgz",
"integrity": "sha512-XKLA6syeBUaPzx4j3qwMqzzq+V4uo72BnlbOjmuljLrRqdsd3qnzvZZoxvMHZ23ndsRS4aufU6JOZYpCbU6T1A==",
"license": "MIT"
},
"node_modules/@types/babel__core": {
"version": "7.20.5",
"resolved": "https://registry.npmjs.org/@types/babel__core/-/babel__core-7.20.5.tgz",
@@ -2488,6 +2504,31 @@
"vite": "^4.2.0 || ^5.0.0 || ^6.0.0 || ^7.0.0"
}
},
"node_modules/3d-force-graph": {
"version": "1.79.1",
"resolved": "https://registry.npmjs.org/3d-force-graph/-/3d-force-graph-1.79.1.tgz",
"integrity": "sha512-iscIVt4jWjJ11KEEswgOIOWk8Ew4EFKHRyERJXJ0ouycqzHCtWwb9E5imnxS5rYF1f1IESkFNAfB+h3EkU0Irw==",
"license": "MIT",
"dependencies": {
"accessor-fn": "1",
"kapsule": "^1.16",
"three": ">=0.118 <1",
"three-forcegraph": "1",
"three-render-objects": "^1.35"
},
"engines": {
"node": ">=12"
}
},
"node_modules/accessor-fn": {
"version": "1.5.3",
"resolved": "https://registry.npmjs.org/accessor-fn/-/accessor-fn-1.5.3.tgz",
"integrity": "sha512-rkAofCwe/FvYFUlMB0v0gWmhqtfAtV1IUkdPbfhTUyYniu5LrC0A0UJkTH0Jv3S8SvwkmfuAlY+mQIJATdocMA==",
"license": "MIT",
"engines": {
"node": ">=12"
}
},
"node_modules/acorn": {
"version": "8.15.0",
"resolved": "https://registry.npmjs.org/acorn/-/acorn-8.15.0.tgz",
@@ -2793,6 +2834,12 @@
"node": ">=12"
}
},
"node_modules/d3-binarytree": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/d3-binarytree/-/d3-binarytree-1.0.2.tgz",
"integrity": "sha512-cElUNH+sHu95L04m92pG73t2MEJXKu+GeKUN1TJkFsu93E5W8E9Sc3kHEGJKgenGvj19m6upSn2EunvMgMD2Yw==",
"license": "MIT"
},
"node_modules/d3-cloud": {
"version": "1.2.8",
"resolved": "https://registry.npmjs.org/d3-cloud/-/d3-cloud-1.2.8.tgz",
@@ -2826,6 +2873,22 @@
"node": ">=12"
}
},
"node_modules/d3-force-3d": {
"version": "3.0.6",
"resolved": "https://registry.npmjs.org/d3-force-3d/-/d3-force-3d-3.0.6.tgz",
"integrity": "sha512-4tsKHUPLOVkyfEffZo1v6sFHvGFwAIIjt/W8IThbp08DYAsXZck+2pSHEG5W1+gQgEvFLdZkYvmJAbRM2EzMnA==",
"license": "MIT",
"dependencies": {
"d3-binarytree": "1",
"d3-dispatch": "1 - 3",
"d3-octree": "1",
"d3-quadtree": "1 - 3",
"d3-timer": "1 - 3"
},
"engines": {
"node": ">=12"
}
},
"node_modules/d3-format": {
"version": "3.1.2",
"resolved": "https://registry.npmjs.org/d3-format/-/d3-format-3.1.2.tgz",
@@ -2847,6 +2910,12 @@
"node": ">=12"
}
},
"node_modules/d3-octree": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/d3-octree/-/d3-octree-1.1.0.tgz",
"integrity": "sha512-F8gPlqpP+HwRPMO/8uOu5wjH110+6q4cgJvgJT6vlpy3BEaDIKlTZrgHKZSp/i1InRpVfh4puY/kvL6MxK930A==",
"license": "MIT"
},
"node_modules/d3-path": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/d3-path/-/d3-path-3.1.0.tgz",
@@ -2856,6 +2925,15 @@
"node": ">=12"
}
},
"node_modules/d3-quadtree": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/d3-quadtree/-/d3-quadtree-3.0.1.tgz",
"integrity": "sha512-04xDrxQTDTCFwP5H6hRhsRcb9xxv2RzkcsygFzmkSIOJy3PeRJP7sNk3VRIbKXcog561P9oU0/rVH6vDROAgUw==",
"license": "ISC",
"engines": {
"node": ">=12"
}
},
"node_modules/d3-scale": {
"version": "4.0.2",
"resolved": "https://registry.npmjs.org/d3-scale/-/d3-scale-4.0.2.tgz",
@@ -2958,6 +3036,18 @@
"d3-selection": "2 - 3"
}
},
"node_modules/data-bind-mapper": {
"version": "1.0.3",
"resolved": "https://registry.npmjs.org/data-bind-mapper/-/data-bind-mapper-1.0.3.tgz",
"integrity": "sha512-QmU3lyEnbENQPo0M1F9BMu4s6cqNNp8iJA+b/HP2sSb7pf3dxwF3+EP1eO69rwBfH9kFJ1apmzrtogAmVt2/Xw==",
"license": "MIT",
"dependencies": {
"accessor-fn": "1"
},
"engines": {
"node": ">=12"
}
},
"node_modules/debug": {
"version": "4.4.3",
"resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz",
@@ -3419,6 +3509,20 @@
"dev": true,
"license": "ISC"
},
"node_modules/float-tooltip": {
"version": "1.7.5",
"resolved": "https://registry.npmjs.org/float-tooltip/-/float-tooltip-1.7.5.tgz",
"integrity": "sha512-/kXzuDnnBqyyWyhDMH7+PfP8J/oXiAavGzcRxASOMRHFuReDtofizLLJsf7nnDLAfEaMW4pVWaXrAjtnglpEkg==",
"license": "MIT",
"dependencies": {
"d3-selection": "2 - 3",
"kapsule": "^1.16",
"preact": "10"
},
"engines": {
"node": ">=12"
}
},
"node_modules/follow-redirects": {
"version": "1.15.11",
"resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.11.tgz",
@@ -3722,11 +3826,19 @@
"dev": true,
"license": "ISC"
},
"node_modules/jerrypick": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/jerrypick/-/jerrypick-1.1.2.tgz",
"integrity": "sha512-YKnxXEekXKzhpf7CLYA0A+oDP8V0OhICNCr5lv96FvSsDEmrb0GKM776JgQvHTMjr7DTTPEVv/1Ciaw0uEWzBA==",
"license": "MIT",
"engines": {
"node": ">=12"
}
},
"node_modules/js-tokens": {
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz",
"integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==",
"dev": true,
"license": "MIT"
},
"node_modules/js-yaml": {
@@ -3789,6 +3901,18 @@
"node": ">=6"
}
},
"node_modules/kapsule": {
"version": "1.16.3",
"resolved": "https://registry.npmjs.org/kapsule/-/kapsule-1.16.3.tgz",
"integrity": "sha512-4+5mNNf4vZDSwPhKprKwz3330iisPrb08JyMgbsdFrimBCKNHecua/WBwvVg3n7vwx0C1ARjfhwIpbrbd9n5wg==",
"license": "MIT",
"dependencies": {
"lodash-es": "4"
},
"engines": {
"node": ">=12"
}
},
"node_modules/keyv": {
"version": "4.5.4",
"resolved": "https://registry.npmjs.org/keyv/-/keyv-4.5.4.tgz",
@@ -3835,6 +3959,12 @@
"integrity": "sha512-LgVTMpQtIopCi79SJeDiP0TfWi5CNEc/L/aRdTh3yIvmZXTnheWpKjSZhnvMl8iXbC1tFg9gdHHDMLoV7CnG+w==",
"license": "MIT"
},
"node_modules/lodash-es": {
"version": "4.17.23",
"resolved": "https://registry.npmjs.org/lodash-es/-/lodash-es-4.17.23.tgz",
"integrity": "sha512-kVI48u3PZr38HdYz98UmfPnXl2DXrpdctLrFLCd3kOx1xUkOmpFPx7gCWWM5MPkL/fD8zb+Ph0QzjGFs4+hHWg==",
"license": "MIT"
},
"node_modules/lodash.debounce": {
"version": "4.0.8",
"resolved": "https://registry.npmjs.org/lodash.debounce/-/lodash.debounce-4.0.8.tgz",
@@ -3848,6 +3978,18 @@
"dev": true,
"license": "MIT"
},
"node_modules/loose-envify": {
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/loose-envify/-/loose-envify-1.4.0.tgz",
"integrity": "sha512-lyuxPGr/Wfhrlem2CL/UcnUc1zcqKAImBDzukY7Y5F/yQiNdko6+fRLevlw1HgMySw7f611UIY408EtxRSoK3Q==",
"license": "MIT",
"dependencies": {
"js-tokens": "^3.0.0 || ^4.0.0"
},
"bin": {
"loose-envify": "cli.js"
}
},
"node_modules/lru-cache": {
"version": "5.1.1",
"resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz",
@@ -3934,6 +4076,44 @@
"dev": true,
"license": "MIT"
},
"node_modules/ngraph.events": {
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/ngraph.events/-/ngraph.events-1.4.0.tgz",
"integrity": "sha512-NeDGI4DSyjBNBRtA86222JoYietsmCXbs8CEB0dZ51Xeh4lhVl1y3wpWLumczvnha8sFQIW4E0vvVWwgmX2mGw==",
"license": "BSD-3-Clause"
},
"node_modules/ngraph.forcelayout": {
"version": "3.3.1",
"resolved": "https://registry.npmjs.org/ngraph.forcelayout/-/ngraph.forcelayout-3.3.1.tgz",
"integrity": "sha512-MKBuEh1wujyQHFTW57y5vd/uuEOK0XfXYxm3lC7kktjJLRdt/KEKEknyOlc6tjXflqBKEuYBBcu7Ax5VY+S6aw==",
"license": "BSD-3-Clause",
"dependencies": {
"ngraph.events": "^1.0.0",
"ngraph.merge": "^1.0.0",
"ngraph.random": "^1.0.0"
}
},
"node_modules/ngraph.graph": {
"version": "20.1.2",
"resolved": "https://registry.npmjs.org/ngraph.graph/-/ngraph.graph-20.1.2.tgz",
"integrity": "sha512-W/G3GBR3Y5UxMLHTUCPP9v+pbtpzwuAEIqP5oZV+9IwgxAIEZwh+Foc60iPc1idlnK7Zxu0p3puxAyNmDvBd0Q==",
"license": "BSD-3-Clause",
"dependencies": {
"ngraph.events": "^1.4.0"
}
},
"node_modules/ngraph.merge": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/ngraph.merge/-/ngraph.merge-1.0.0.tgz",
"integrity": "sha512-5J8YjGITUJeapsomtTALYsw7rFveYkM+lBj3QiYZ79EymQcuri65Nw3knQtFxQBU1r5iOaVRXrSwMENUPK62Vg==",
"license": "MIT"
},
"node_modules/ngraph.random": {
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/ngraph.random/-/ngraph.random-1.2.0.tgz",
"integrity": "sha512-4EUeAGbB2HWX9njd6bP6tciN6ByJfoaAvmVL9QTaZSeXrW46eNGA9GajiXiPBbvFqxUWFkEbyo6x5qsACUuVfA==",
"license": "BSD-3-Clause"
},
"node_modules/node-releases": {
"version": "2.0.27",
"resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.27.tgz",
@@ -3941,6 +4121,15 @@
"dev": true,
"license": "MIT"
},
"node_modules/object-assign": {
"version": "4.1.1",
"resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz",
"integrity": "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==",
"license": "MIT",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/optionator": {
"version": "0.9.4",
"resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.4.tgz",
@@ -4044,6 +4233,18 @@
"url": "https://github.com/sponsors/jonschlinkert"
}
},
"node_modules/polished": {
"version": "4.3.1",
"resolved": "https://registry.npmjs.org/polished/-/polished-4.3.1.tgz",
"integrity": "sha512-OBatVyC/N7SCW/FaDHrSd+vn0o5cS855TOmYi4OkdWUMSJCET/xip//ch8xGUvtr3i44X9LVyWwQlRMTN3pwSA==",
"license": "MIT",
"dependencies": {
"@babel/runtime": "^7.17.8"
},
"engines": {
"node": ">=10"
}
},
"node_modules/postcss": {
"version": "8.5.6",
"resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.6.tgz",
@@ -4073,6 +4274,16 @@
"node": "^10 || ^12 || >=14"
}
},
"node_modules/preact": {
"version": "10.28.3",
"resolved": "https://registry.npmjs.org/preact/-/preact-10.28.3.tgz",
"integrity": "sha512-tCmoRkPQLpBeWzpmbhryairGnhW9tKV6c6gr/w+RhoRoKEJwsjzipwp//1oCpGPOchvSLaAPlpcJi9MwMmoPyA==",
"license": "MIT",
"funding": {
"type": "opencollective",
"url": "https://opencollective.com/preact"
}
},
"node_modules/prelude-ls": {
"version": "1.2.1",
"resolved": "https://registry.npmjs.org/prelude-ls/-/prelude-ls-1.2.1.tgz",
@@ -4083,6 +4294,23 @@
"node": ">= 0.8.0"
}
},
"node_modules/prop-types": {
"version": "15.8.1",
"resolved": "https://registry.npmjs.org/prop-types/-/prop-types-15.8.1.tgz",
"integrity": "sha512-oj87CgZICdulUohogVAR7AjlC0327U4el4L6eAvOqCeudMDVU0NThNaV+b9Df4dXgSP1gXMTnPdhfe/2qDH5cg==",
"license": "MIT",
"dependencies": {
"loose-envify": "^1.4.0",
"object-assign": "^4.1.1",
"react-is": "^16.13.1"
}
},
"node_modules/prop-types/node_modules/react-is": {
"version": "16.13.1",
"resolved": "https://registry.npmjs.org/react-is/-/react-is-16.13.1.tgz",
"integrity": "sha512-24e6ynE2H+OKt4kqsOvNd8kBpV65zoxbA4BVsEOB3ARVWQki/DHzaUoC5KuON/BiccDaCCTZBuOcfZs70kR8bQ==",
"license": "MIT"
},
"node_modules/proxy-from-env": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz",
@@ -4120,6 +4348,23 @@
"react": "^19.2.4"
}
},
"node_modules/react-force-graph-3d": {
"version": "1.29.1",
"resolved": "https://registry.npmjs.org/react-force-graph-3d/-/react-force-graph-3d-1.29.1.tgz",
"integrity": "sha512-5Vp+PGpYnO+zLwgK2NvNqdXHvsWLrFzpDfJW1vUA1twjo9SPvXqfUYQrnRmAbD+K2tOxkZw1BkbH31l5b4TWHg==",
"license": "MIT",
"dependencies": {
"3d-force-graph": "^1.79",
"prop-types": "15",
"react-kapsule": "^2.5"
},
"engines": {
"node": ">=12"
},
"peerDependencies": {
"react": "*"
}
},
"node_modules/react-is": {
"version": "19.2.4",
"resolved": "https://registry.npmjs.org/react-is/-/react-is-19.2.4.tgz",
@@ -4127,6 +4372,21 @@
"license": "MIT",
"peer": true
},
"node_modules/react-kapsule": {
"version": "2.5.7",
"resolved": "https://registry.npmjs.org/react-kapsule/-/react-kapsule-2.5.7.tgz",
"integrity": "sha512-kifAF4ZPD77qZKc4CKLmozq6GY1sBzPEJTIJb0wWFK6HsePJatK3jXplZn2eeAt3x67CDozgi7/rO8fNQ/AL7A==",
"license": "MIT",
"dependencies": {
"jerrypick": "^1.1.1"
},
"engines": {
"node": ">=12"
},
"peerDependencies": {
"react": ">=16.13.1"
}
},
"node_modules/react-redux": {
"version": "9.2.0",
"resolved": "https://registry.npmjs.org/react-redux/-/react-redux-9.2.0.tgz",
@@ -4413,12 +4673,67 @@
"integrity": "sha512-05PUHKSNE8ou2dwIxTngl4EzcnsCDZGJ/iCLtDflR/SHB/ny14rXc+qU5P4mG9JkusiV7EivzY9Mhm55AzAvCg==",
"license": "MIT"
},
"node_modules/three": {
"version": "0.182.0",
"resolved": "https://registry.npmjs.org/three/-/three-0.182.0.tgz",
"integrity": "sha512-GbHabT+Irv+ihI1/f5kIIsZ+Ef9Sl5A1Y7imvS5RQjWgtTPfPnZ43JmlYI7NtCRDK9zir20lQpfg8/9Yd02OvQ==",
"license": "MIT"
},
"node_modules/three-forcegraph": {
"version": "1.43.1",
"resolved": "https://registry.npmjs.org/three-forcegraph/-/three-forcegraph-1.43.1.tgz",
"integrity": "sha512-lQnYPLvR31gb91mF5xHhU0jPHJgBPw9QB23R6poCk8Tgvz8sQtq7wTxwClcPdfKCBbHXsb7FSqK06Osiu1kQ5A==",
"license": "MIT",
"dependencies": {
"accessor-fn": "1",
"d3-array": "1 - 3",
"d3-force-3d": "2 - 3",
"d3-scale": "1 - 4",
"d3-scale-chromatic": "1 - 3",
"data-bind-mapper": "1",
"kapsule": "^1.16",
"ngraph.forcelayout": "3",
"ngraph.graph": "20",
"tinycolor2": "1"
},
"engines": {
"node": ">=12"
},
"peerDependencies": {
"three": ">=0.118.3"
}
},
"node_modules/three-render-objects": {
"version": "1.40.4",
"resolved": "https://registry.npmjs.org/three-render-objects/-/three-render-objects-1.40.4.tgz",
"integrity": "sha512-Ukpu1pei3L5r809izvjsZxwuRcYLiyn6Uvy3lZ9bpMTdvj3i6PeX6w++/hs2ZS3KnEzGjb6YvTvh4UQuwHTDJg==",
"license": "MIT",
"dependencies": {
"@tweenjs/tween.js": "18 - 25",
"accessor-fn": "1",
"float-tooltip": "^1.7",
"kapsule": "^1.16",
"polished": "4"
},
"engines": {
"node": ">=12"
},
"peerDependencies": {
"three": ">=0.168"
}
},
"node_modules/tiny-invariant": {
"version": "1.3.3",
"resolved": "https://registry.npmjs.org/tiny-invariant/-/tiny-invariant-1.3.3.tgz",
"integrity": "sha512-+FbBPE1o9QAYvviau/qC5SE3caw21q3xkvWKBtja5vgqOWIHHJ3ioaq1VPfn/Szqctz2bU/oYeKd9/z5BL+PVg==",
"license": "MIT"
},
"node_modules/tinycolor2": {
"version": "1.6.0",
"resolved": "https://registry.npmjs.org/tinycolor2/-/tinycolor2-1.6.0.tgz",
"integrity": "sha512-XPaBkWQJdsf3pLKJV9p4qN/S+fm2Oj8AIPo1BTUhg5oxkvm9+SVEGFdhyOz7tTdUTfvxMiAs4sp6/eZO2Ew+pw==",
"license": "MIT"
},
"node_modules/tinyglobby": {
"version": "0.2.15",
"resolved": "https://registry.npmjs.org/tinyglobby/-/tinyglobby-0.2.15.tgz",

View File

@@ -17,6 +17,7 @@
"headlessui": "^0.0.0",
"react": "^19.2.0",
"react-dom": "^19.2.0",
"react-force-graph-3d": "^1.29.1",
"react-router-dom": "^7.13.0",
"recharts": "^3.7.0"
},

View File

@@ -0,0 +1,61 @@
import ForceGraph3D from "react-force-graph-3d";
import {
type UserAnalysisResponse,
type InteractionGraph
} from '../types/ApiTypes';
import StatsStyling from "../styles/stats_styling";
const styles = StatsStyling;
/**
 * Convert the API's nested interaction map into force-graph nodes/links.
 * Low-value (< 2) links and anything touching "[deleted]" are pruned,
 * along with any nodes left unconnected after pruning.
 */
function ApiToGraphData(apiData: InteractionGraph) {
  // One node per username, one directed link per (source -> target) pair.
  const nodes = Object.keys(apiData).map((username) => ({ id: username }));
  const links = Object.entries(apiData).flatMap(([source, targets]) =>
    Object.entries(targets).map(([target, count]) => ({
      source,
      target,
      value: count,
    }))
  );
  // Drop low-value and deleted interactions to reduce clutter.
  const keptLinks = links.filter(
    ({ source, target, value }) =>
      value >= 2 && source !== "[deleted]" && target !== "[deleted]"
  );
  // Prune nodes that are no longer connected after link filtering.
  const connected = new Set(keptLinks.flatMap((link) => [link.source, link.target]));
  return {
    nodes: nodes.filter((node) => connected.has(node.id)),
    links: keptLinks,
  };
}
const UserStats = (props: { data: UserAnalysisResponse }) => {
const graphData = ApiToGraphData(props.data.interaction_graph);
return (
<div style={styles.page}>
<h2 style={styles.sectionTitle}>User Interaction Graph</h2>
<p style={styles.sectionSubtitle}>
This graph visualizes interactions between users based on comments and replies.
Nodes represent users, and edges represent interactions (e.g., comments or replies) between them.
</p>
<div>
<ForceGraph3D
graphData={graphData}
nodeAutoColorBy="id"
linkDirectionalParticles={2}
linkDirectionalParticleSpeed={0.005}
linkWidth={(link) => Math.sqrt(link.value)}
nodeLabel={(node) => `${node.id}`}
/>
</div>
</div>
);
}
export default UserStats;

View File

@@ -3,6 +3,7 @@ import axios from "axios";
import StatsStyling from "../styles/stats_styling";
import SummaryStats from "../components/SummaryStats";
import EmotionalStats from "../components/EmotionalStats";
import InteractionStats from "../components/UserStats";
import {
type SummaryResponse,
@@ -16,7 +17,7 @@ const styles = StatsStyling;
const StatPage = () => {
const [error, setError] = useState('');
const [loading, setLoading] = useState(false);
const [activeView, setActiveView] = useState<"summary" | "emotional">("summary");
const [activeView, setActiveView] = useState<"summary" | "emotional" | "interaction">("summary");
const [userData, setUserData] = useState<UserAnalysisResponse | null>(null);
const [timeData, setTimeData] = useState<TimeAnalysisResponse | null>(null);
@@ -133,6 +134,13 @@ return (
>
Emotional
</button>
<button
onClick={() => setActiveView("interaction")}
style={activeView === "interaction" ? styles.buttonPrimary : styles.buttonSecondary}
>
Interaction
</button>
</div>
{activeView === "summary" && (
@@ -154,6 +162,10 @@ return (
</div>
)}
{activeView === "interaction" && userData && (
<InteractionStats data={userData} />
)}
</div>
);
}

View File

@@ -35,9 +35,12 @@ type User = {
vocab?: Vocab | null;
};
type InteractionGraph = Record<string, Record<string, number>>;
type UserAnalysisResponse = {
top_users: TopUser[];
users: User[];
interaction_graph: InteractionGraph;
};
// Time Analysis
@@ -89,6 +92,7 @@ export type {
TopUser,
Vocab,
User,
InteractionGraph,
UserAnalysisResponse,
FrequencyWord,
AverageEmotionByTopic,

View File

@@ -0,0 +1,41 @@
import pandas as pd
class EmotionalAnalysis:
    """Aggregate emotion-score statistics over a posts/comments DataFrame."""

    def __init__(self, df: pd.DataFrame):
        # df is expected to carry per-row emotion_* score columns and a "topic" column.
        self.df = df

    def avg_emotion_by_topic(self) -> list:
        """Return per-topic mean emotion scores plus the row count per topic.

        Rows whose topic is "Misc" (the low-confidence bucket) are excluded,
        as are the neutral/surprise emotion columns.

        Returns:
            A list of record dicts, one per topic, with the mean of each
            emotion column and "n" (number of rows for that topic).
        """
        emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
        emotion_cols = [
            col for col in self.df.columns
            if col.startswith("emotion_") and col not in emotion_exclusions
        ]
        # Compute the non-Misc subset once instead of filtering twice.
        topical = self.df[self.df["topic"] != "Misc"]
        counts = topical.groupby("topic").size().rename("n")
        avg_emotion_by_topic = (
            topical.groupby("topic")[emotion_cols]
            .mean()
            .reset_index()
            .merge(counts, on="topic")
        )
        return avg_emotion_by_topic.to_dict(orient='records')

View File

@@ -0,0 +1,208 @@
import pandas as pd
import re
from collections import Counter
class InteractionAnalysis:
    def __init__(self, df: pd.DataFrame, word_exclusions: set[str]):
        """Store the dataset and the stop-word set used by tokenization."""
        self.df = df
        self.word_exclusions = word_exclusions
def _tokenize(self, text: str):
tokens = re.findall(r"\b[a-z]{3,}\b", text)
return [t for t in tokens if t not in self.word_exclusions]
def _vocab_richness_per_user(self, min_words: int = 20, top_most_used_words: int = 100) -> list:
df = self.df.copy()
df["content"] = df["content"].fillna("").astype(str).str.lower()
df["tokens"] = df["content"].apply(self._tokenize)
rows = []
for author, group in df.groupby("author"):
all_tokens = [t for tokens in group["tokens"] for t in tokens]
total_words = len(all_tokens)
unique_words = len(set(all_tokens))
events = len(group)
# Min amount of words for a user, any less than this might give weird results
if total_words < min_words:
continue
# 100% = they never reused a word (excluding stop words)
vocab_richness = unique_words / total_words
avg_words = total_words / max(events, 1)
counts = Counter(all_tokens)
top_words = [
{"word": w, "count": int(c)}
for w, c in counts.most_common(top_most_used_words)
]
rows.append({
"author": author,
"events": int(events),
"total_words": int(total_words),
"unique_words": int(unique_words),
"vocab_richness": round(vocab_richness, 3),
"avg_words_per_event": round(avg_words, 2),
"top_words": top_words
})
rows = sorted(rows, key=lambda x: x["vocab_richness"], reverse=True)
return rows
def top_users(self) -> list:
counts = (
self.df.groupby(["author", "source"])
.size()
.sort_values(ascending=False)
)
top_users = [
{"author": author, "source": source, "count": int(count)}
for (author, source), count in counts.items()
]
return top_users
def per_user_analysis(self) -> dict:
per_user = (
self.df.groupby(["author", "type"])
.size()
.unstack(fill_value=0)
)
# ensure columns always exist
for col in ("post", "comment"):
if col not in per_user.columns:
per_user[col] = 0
per_user["comment_post_ratio"] = per_user["comment"] / per_user["post"].replace(0, 1)
per_user["comment_share"] = per_user["comment"] / (per_user["post"] + per_user["comment"]).replace(0, 1)
per_user = per_user.sort_values("comment_post_ratio", ascending=True)
per_user_records = per_user.reset_index().to_dict(orient="records")
vocab_rows = self._vocab_richness_per_user()
vocab_by_author = {row["author"]: row for row in vocab_rows}
# merge vocab richness + per_user information
merged_users = []
for row in per_user_records:
author = row["author"]
merged_users.append({
"author": author,
"post": int(row.get("post", 0)),
"comment": int(row.get("comment", 0)),
"comment_post_ratio": float(row.get("comment_post_ratio", 0)),
"comment_share": float(row.get("comment_share", 0)),
"vocab": vocab_by_author.get(author)
})
merged_users.sort(key=lambda u: u["comment_post_ratio"])
return merged_users
def interaction_graph(self):
interactions = {a: {} for a in self.df["author"].dropna().unique()}
# reply_to refers to the comment id, this allows us to map comment ids to usernames
id_to_author = self.df.set_index("id")["author"].to_dict()
for _, row in self.df.iterrows():
a = row["author"]
reply_id = row["reply_to"]
if pd.isna(a) or pd.isna(reply_id) or reply_id == "":
continue
b = id_to_author.get(reply_id)
if b is None or a == b:
continue
interactions[a][b] = interactions[a].get(b, 0) + 1
return interactions
def average_thread_depth(self):
depths = []
id_to_reply = self.df.set_index("id")["reply_to"].to_dict()
for _, row in self.df.iterrows():
depth = 0
current_id = row["id"]
while True:
reply_to = id_to_reply.get(current_id)
if pd.isna(reply_to) or reply_to == "":
break
depth += 1
current_id = reply_to
depths.append(depth)
if not depths:
return 0
return round(sum(depths) / len(depths), 2)
    def average_thread_length_by_emotion(self):
        """Average reply-chain length grouped by each row's dominant emotion.

        Walks reply_to links upward from every row (memoizing partial chain
        lengths), then buckets each row's chain length under its
        highest-scoring emotion column (neutral/surprise excluded).
        Returns {emotion_col: mean_length} rounded to 2 decimals.
        """
        emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
        emotion_cols = [
            c for c in self.df.columns
            if c.startswith("emotion_") and c not in emotion_exclusions
        ]
        id_to_reply = self.df.set_index("id")["reply_to"].to_dict()
        # Memoize chain lengths so shared ancestors are walked only once.
        length_cache = {}
        def thread_length_from(start_id):
            # Number of messages from start_id up to the thread root (inclusive).
            if start_id in length_cache:
                return length_cache[start_id]
            seen = set()
            length = 1
            current = start_id
            while True:
                if current in seen:
                    # infinite loop shouldn't happen, but just in case
                    break
                seen.add(current)
                reply_to = id_to_reply.get(current)
                if reply_to is None or (isinstance(reply_to, float) and pd.isna(reply_to)) or reply_to == "":
                    break
                length += 1
                current = reply_to
                if current in length_cache:
                    # Reuse the cached suffix instead of walking it again.
                    length += (length_cache[current] - 1)
                    break
            length_cache[start_id] = length
            return length
        emotion_to_lengths = {}
        # Fill NaNs in emotion cols to avoid max() issues
        emo_df = self.df[["id"] + emotion_cols].copy()
        emo_df[emotion_cols] = emo_df[emotion_cols].fillna(0)
        for _, row in emo_df.iterrows():
            msg_id = row["id"]
            length = thread_length_from(msg_id)
            emotions = {c: row[c] for c in emotion_cols}
            # Dominant = highest-scoring non-excluded emotion for this row.
            dominant = max(emotions, key=emotions.get)
            emotion_to_lengths.setdefault(dominant, []).append(length)
        return {
            emotion: round(sum(lengths) / len(lengths), 2)
            for emotion, lengths in emotion_to_lengths.items()
        }

View File

@@ -0,0 +1,113 @@
import pandas as pd
import re
from collections import Counter
from itertools import islice
class LinguisticAnalysis:
    def __init__(self, df: pd.DataFrame, word_exclusions: set[str]):
        """Store the dataset and the stop-word set used by tokenization."""
        self.df = df
        self.word_exclusions = word_exclusions
def _tokenize(self, text: str):
tokens = re.findall(r"\b[a-z]{3,}\b", text)
return [t for t in tokens if t not in self.word_exclusions]
def _clean_text(self, text: str) -> str:
text = re.sub(r"http\S+", "", text) # remove URLs
text = re.sub(r"www\S+", "", text)
text = re.sub(r"&\w+;", "", text) # remove HTML entities
text = re.sub(r"\bamp\b", "", text) # remove stray amp
text = re.sub(r"\S+\.(jpg|jpeg|png|webp|gif)", "", text)
return text
def word_frequencies(self, limit: int = 100) -> dict:
texts = (
self.df["content"]
.dropna()
.astype(str)
.str.lower()
)
words = []
for text in texts:
tokens = re.findall(r"\b[a-z]{3,}\b", text)
words.extend(
w for w in tokens
if w not in self.word_exclusions
)
counts = Counter(words)
word_frequencies = (
pd.DataFrame(counts.items(), columns=["word", "count"])
.sort_values("count", ascending=False)
.head(limit)
.reset_index(drop=True)
)
return word_frequencies.to_dict(orient="records")
def ngrams(self, n=2, limit=100):
texts = self.df["content"].dropna().astype(str).apply(self._clean_text).str.lower()
all_ngrams = []
for text in texts:
tokens = re.findall(r"\b[a-z]{3,}\b", text)
# stop word removal causes strange behaviors in ngrams
#tokens = [w for w in tokens if w not in self.word_exclusions]
ngrams = zip(*(islice(tokens, i, None) for i in range(n)))
all_ngrams.extend([" ".join(ng) for ng in ngrams])
counts = Counter(all_ngrams)
return (
pd.DataFrame(counts.items(), columns=["ngram", "count"])
.sort_values("count", ascending=False)
.head(limit)
.to_dict(orient="records")
)
def identity_markers(self):
df = self.df.copy()
df["content"] = df["content"].fillna("").astype(str).str.lower()
in_group_words = {"we", "us", "our", "ourselves"}
out_group_words = {"they", "them", "their", "themselves"}
emotion_exclusions = [
"emotion_neutral",
"emotion_surprise"
]
emotion_cols = [
col for col in self.df.columns
if col.startswith("emotion_") and col not in emotion_exclusions
]
in_count = 0
out_count = 0
in_emotions = {e: 0 for e in emotion_cols}
out_emotions = {e: 0 for e in emotion_cols}
total = 0
for post in df:
text = post["content"]
tokens = re.findall(r"\b[a-z]{2,}\b", text)
total += len(tokens)
in_count += sum(t in in_group_words for t in tokens)
out_count += sum(t in out_group_words for t in tokens)
emotions = post[emotion_cols]
print(emotions)
return {
"in_group_usage": in_count,
"out_group_usage": out_count,
"in_group_ratio": round(in_count / max(total, 1), 5),
"out_group_ratio": round(out_count / max(total, 1), 5),
}

View File

@@ -9,6 +9,7 @@ from sentence_transformers import SentenceTransformer
class NLP:
_topic_models: dict[str, SentenceTransformer] = {}
_emotion_classifiers: dict[str, Any] = {}
_entity_recognizers: dict[str, Any] = {}
_topic_embedding_cache: dict[tuple[str, ...], np.ndarray] = {}
def __init__(
@@ -29,6 +30,9 @@ class NLP:
self.emotion_classifier = self._get_emotion_classifier(
self.device_str, self.pipeline_device
)
self.entity_recognizer = self._get_entity_recognizer(
self.device_str, self.pipeline_device
)
except RuntimeError as exc:
if self.use_cuda and "out of memory" in str(exc).lower():
torch.cuda.empty_cache()
@@ -86,6 +90,27 @@ class NLP:
)
cls._emotion_classifiers[device_str] = classifier
return classifier
    @classmethod
    def _get_entity_recognizer(cls, device_str: str, pipeline_device: int) -> Any:
        """Return a cached NER pipeline for *device_str*, creating it on first use.

        Cached per device in ``_entity_recognizers`` so repeated NLP
        instances on the same device share one model.
        """
        recognizer = cls._entity_recognizers.get(device_str)
        if recognizer is None:
            pipeline_kwargs = {
                "aggregation_strategy": "simple",  # merges subwords
                "device": pipeline_device,
            }
            if device_str == "cuda":
                # fp16 halves GPU memory use for inference on CUDA.
                pipeline_kwargs["dtype"] = torch.float16
            recognizer = pipeline(
                "token-classification",
                model="dslim/bert-base-NER",
                **pipeline_kwargs,
            )
            cls._entity_recognizers[device_str] = recognizer
        return recognizer
def _encode_with_backoff(
self, texts: list[str], initial_batch_size: int
@@ -129,6 +154,26 @@ class NLP:
continue
raise
    def _infer_entities_with_backoff(
        self, texts: list[str], initial_batch_size: int
    ) -> list[list[dict[str, Any]]]:
        """Run NER over *texts*, halving the batch size on CUDA OOM.

        Retries down to a floor batch size of 4; any other RuntimeError
        (or OOM when not on CUDA) is re-raised.
        """
        batch_size = initial_batch_size
        while True:
            try:
                return self.entity_recognizer(texts, batch_size=batch_size)
            except RuntimeError as exc:
                if (
                    self.use_cuda
                    and "out of memory" in str(exc).lower()
                    and batch_size > 4
                ):
                    # Shrink the batch and retry after freeing cached GPU memory.
                    batch_size = max(4, batch_size // 2)
                    torch.cuda.empty_cache()
                    continue
                raise
def add_emotion_cols(self) -> None:
texts = self.df[self.content_col].astype(str).str.slice(0, 512).tolist()
@@ -183,3 +228,51 @@ class NLP:
self.df.loc[self.df["topic_confidence"] < confidence_threshold, "topic"] = (
"Misc"
)
    def add_ner_cols(self, max_chars: int = 512) -> None:
        """Add per-row Named Entity Recognition columns to ``self.df``.

        Adds "entities" (list of {"text", "label"} dicts), "entity_counts"
        (label -> count dict), and one ``entity_<LABEL>`` count column per
        label seen anywhere in the dataset. Text is truncated to *max_chars*
        before inference.
        """
        texts = (
            self.df[self.content_col]
            .fillna("")
            .astype(str)
            .str.slice(0, max_chars)
            .tolist()
        )
        if not texts:
            # Empty dataset: still create the columns so callers can rely on them.
            self.df["entities"] = []
            self.df["entity_counts"] = []
            return
        results = self._infer_entities_with_backoff(texts, 32 if self.use_cuda else 8)
        entity_lists = []
        entity_count_dicts = []
        for row in results:
            entities = []
            counts = {}
            for ent in row:
                word = ent.get("word")
                label = ent.get("entity_group")
                # Skip malformed pipeline outputs (non-string word/label).
                if isinstance(word, str) and isinstance(label, str):
                    entities.append({"text": word, "label": label})
                    counts[label] = counts.get(label, 0) + 1
            entity_lists.append(entities)
            entity_count_dicts.append(counts)
        self.df["entities"] = entity_lists
        self.df["entity_counts"] = entity_count_dicts
        # Expand label counts into columns
        all_labels = set()
        for d in entity_count_dicts:
            all_labels.update(d.keys())
        for label in all_labels:
            col_name = f"entity_{label}"
            self.df[col_name] = [
                d.get(label, 0) for d in entity_count_dicts
            ]

View File

@@ -0,0 +1,70 @@
import pandas as pd
class TemporalAnalysis:
    def __init__(self, df: pd.DataFrame):
        """Store the dataset; expects "id", "type", "reply_to", and time columns."""
        self.df = df
def avg_reply_time_per_emotion(self) -> dict:
df = self.df.copy()
replies = df[
(df["type"] == "comment") &
(df["reply_to"].notna()) &
(df["reply_to"] != "")
]
id_to_time = df.set_index("id")["dt"].to_dict()
def compute_reply_time(row):
reply_id = row["reply_to"]
parent_time = id_to_time.get(reply_id)
if parent_time is None:
return None
return (row["dt"] - parent_time).total_seconds()
replies["reply_time"] = replies.apply(compute_reply_time, axis=1)
emotion_cols = [col for col in df.columns if col.startswith("emotion_") and col not in ("emotion_neutral", "emotion_surprise")]
replies["dominant_emotion"] = replies[emotion_cols].idxmax(axis=1)
grouped = (
replies
.groupby("dominant_emotion")["reply_time"]
.agg(["mean", "count"])
.reset_index()
)
return grouped.to_dict(orient="records")
def posts_per_day(self) -> dict:
per_day = (
self.df.groupby("date")
.size()
.reset_index(name="count")
)
return per_day.to_dict(orient="records")
def heatmap(self) -> dict:
weekday_order = [
"Monday", "Tuesday", "Wednesday",
"Thursday", "Friday", "Saturday", "Sunday"
]
self.df["weekday"] = pd.Categorical(
self.df["weekday"],
categories=weekday_order,
ordered=True
)
heatmap = (
self.df
.groupby(["weekday", "hour"], observed=True)
.size()
.unstack(fill_value=0)
.reindex(columns=range(24), fill_value=0)
)
heatmap.columns = heatmap.columns.map(str)
return heatmap.to_dict(orient="records")

View File

@@ -12,7 +12,7 @@ app = Flask(__name__)
CORS(app, resources={r"/*": {"origins": "http://localhost:5173"}})
# Global State
posts_df = pd.read_json('posts.jsonl', lines=True)
posts_df = pd.read_json('small.jsonl', lines=True)
with open("topic_buckets.json", "r", encoding="utf-8") as f:
domain_topics = json.load(f)
stat_obj = StatGen(posts_df, domain_topics)
@@ -47,7 +47,7 @@ def get_dataset():
if stat_obj is None:
return jsonify({"error": "No data uploaded"}), 400
return jsonify(stat_obj.df.to_dict(orient="records")), 200
return stat_obj.df.to_json(orient="records"), 200, {"Content-Type": "application/json"}
@app.route('/stats/content', methods=['GET'])
def word_frequencies():
@@ -55,7 +55,7 @@ def word_frequencies():
return jsonify({"error": "No data uploaded"}), 400
try:
return jsonify(stat_obj.content_analysis()), 200
return jsonify(stat_obj.get_content_analysis()), 200
except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e:
@@ -80,7 +80,7 @@ def get_time_analysis():
return jsonify({"error": "No data uploaded"}), 400
try:
return jsonify(stat_obj.time_analysis()), 200
return jsonify(stat_obj.get_time_analysis()), 200
except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e:
@@ -93,13 +93,39 @@ def get_user_analysis():
return jsonify({"error": "No data uploaded"}), 400
try:
return jsonify(stat_obj.user_analysis()), 200
return jsonify(stat_obj.get_user_analysis()), 200
except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e:
print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route("/stats/cultural", methods=["GET"])
def get_cultural_analysis():
if stat_obj is None:
return jsonify({"error": "No data uploaded"}), 400
try:
return jsonify(stat_obj.get_cultural_analysis()), 200
except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e:
print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route("/stats/interaction", methods=["GET"])
def get_interaction_analysis():
if stat_obj is None:
return jsonify({"error": "No data uploaded"}), 400
try:
return jsonify(stat_obj.get_interactional_analysis()), 200
except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e:
print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route('/filter/search', methods=["POST"])
def search_dataset():
if stat_obj is None:

View File

@@ -1,11 +1,13 @@
import pandas as pd
import re
import nltk
import datetime
import nltk
from nltk.corpus import stopwords
from collections import Counter
from server.nlp import NLP
from server.analysis.nlp import NLP
from server.analysis.temporal import TemporalAnalysis
from server.analysis.emotional import EmotionalAnalysis
from server.analysis.interactional import InteractionAnalysis
from server.analysis.linguistic import LinguisticAnalysis
DOMAIN_STOPWORDS = {
"www", "https", "http",
@@ -23,6 +25,7 @@ EXCLUDE_WORDS = set(stopwords.words('english')) | DOMAIN_STOPWORDS
class StatGen:
def __init__(self, df: pd.DataFrame, domain_topics: dict) -> None:
comments_df = df[["id", "comments"]].explode("comments")
comments_df = comments_df[comments_df["comments"].apply(lambda x: isinstance(x, dict))]
comments_df = pd.json_normalize(comments_df["comments"])
posts_df = df.drop(columns=["comments"])
@@ -35,9 +38,15 @@ class StatGen:
self.df = pd.concat([posts_df, comments_df])
self.df.drop(columns=["post_id"], inplace=True, errors="ignore")
self.nlp = NLP(self.df, "title", "content", domain_topics)
self._add_extra_cols(self.df)
self.temporal_analysis = TemporalAnalysis(self.df)
self.emotional_analysis = EmotionalAnalysis(self.df)
self.interaction_analysis = InteractionAnalysis(self.df, EXCLUDE_WORDS)
self.linguistic_analysis = LinguisticAnalysis(self.df, EXCLUDE_WORDS)
self.original_df = self.df.copy(deep=True)
## Private Methods
@@ -50,141 +59,52 @@ class StatGen:
self.nlp.add_emotion_cols()
self.nlp.add_topic_col()
def _tokenize(self, text: str) -> list:
    """Extract lowercase word tokens of 3+ letters, dropping stopwords."""
    return [
        token
        for token in re.findall(r"\b[a-z]{3,}\b", text)
        if token not in EXCLUDE_WORDS
    ]
def _vocab_richness_per_user(self, min_words: int = 20, top_most_used_words: int = 100) -> list:
    """Per-author vocabulary statistics, sorted by vocabulary richness (desc).

    Authors with fewer than ``min_words`` usable tokens are omitted: tiny
    samples make the unique/total ratio meaningless.
    """
    frame = self.df.copy()
    frame["content"] = frame["content"].fillna("").astype(str).str.lower()
    frame["tokens"] = frame["content"].apply(self._tokenize)
    stats = []
    for author, posts in frame.groupby("author"):
        tokens = [tok for toks in posts["tokens"] for tok in toks]
        n_tokens = len(tokens)
        if n_tokens < min_words:
            continue
        n_unique = len(set(tokens))
        n_events = len(posts)
        word_counts = Counter(tokens)
        stats.append({
            "author": author,
            "events": int(n_events),
            "total_words": int(n_tokens),
            "unique_words": int(n_unique),
            # 1.0 means the author never reused a (non-stop) word.
            "vocab_richness": round(n_unique / n_tokens, 3),
            "avg_words_per_event": round(n_tokens / max(n_events, 1), 2),
            "top_words": [
                {"word": word, "count": int(count)}
                for word, count in word_counts.most_common(top_most_used_words)
            ],
        })
    stats.sort(key=lambda row: row["vocab_richness"], reverse=True)
    return stats
self.nlp.add_ner_cols()
def _interaction_graph(self):
interactions = {a: {} for a in self.df["author"].dropna().unique()}
# reply_to refers to the comment id, this allows us to map comment ids to usernames
id_to_author = self.df.set_index("id")["author"].to_dict()
for _, row in self.df.iterrows():
a = row["author"]
reply_id = row["reply_to"]
if pd.isna(a) or pd.isna(reply_id) or reply_id == "":
continue
b = id_to_author.get(reply_id)
if b is None or a == b:
continue
interactions[a][b] = interactions[a].get(b, 0) + 1
return interactions
def _avg_reply_time_per_emotion(self):
df = self.df.copy()
replies = df[
(df["type"] == "comment") &
(df["reply_to"].notna()) &
(df["reply_to"] != "")
]
id_to_time = df.set_index("id")["dt"].to_dict()
def compute_reply_time(row):
reply_id = row["reply_to"]
parent_time = id_to_time.get(reply_id)
if parent_time is None:
return None
return (row["dt"] - parent_time).total_seconds()
replies["reply_time"] = replies.apply(compute_reply_time, axis=1)
emotion_cols = [col for col in df.columns if col.startswith("emotion_") and col not in ("emotion_neutral", "emotion_surprise")]
replies["dominant_emotion"] = replies[emotion_cols].idxmax(axis=1)
grouped = (
replies
.groupby("dominant_emotion")["reply_time"]
.agg(["mean", "count"])
.reset_index()
)
return grouped.to_dict(orient="records")
## Public
def time_analysis(self) -> pd.DataFrame:
per_day = (
self.df.groupby("date")
.size()
.reset_index(name="count")
)
weekday_order = [
"Monday", "Tuesday", "Wednesday",
"Thursday", "Friday", "Saturday", "Sunday"
]
self.df["weekday"] = pd.Categorical(
self.df["weekday"],
categories=weekday_order,
ordered=True
)
heatmap = (
self.df
.groupby(["weekday", "hour"], observed=True)
.size()
.unstack(fill_value=0)
.reindex(columns=range(24), fill_value=0)
)
heatmap.columns = heatmap.columns.map(str)
burst_index = per_day["count"].std() / max(per_day["count"].mean(), 1)
# topics over time
# emotions over time
def get_time_analysis(self) -> dict:
    """Time-based stats: events per day and the weekday/hour heatmap.

    The previous body mixed stale lines from the old implementation
    (undefined ``per_day``/``heatmap``/``burst_index``, duplicate dict keys,
    a missing comma); the temporal delegate is the single source now.
    The return annotation is corrected from ``pd.DataFrame`` to ``dict``.
    """
    return {
        "events_per_day": self.temporal_analysis.posts_per_day(),
        "weekday_hour_heatmap": self.temporal_analysis.heatmap(),
    }
# average topic duration
def get_content_analysis(self) -> dict:
    """Content-level stats: word/phrase frequencies plus emotion breakdowns."""
    stats = {}
    stats["word_frequencies"] = self.linguistic_analysis.word_frequencies()
    stats["common_two_phrases"] = self.linguistic_analysis.ngrams()
    stats["common_three_phrases"] = self.linguistic_analysis.ngrams(n=3)
    stats["average_emotion_by_topic"] = self.emotional_analysis.avg_emotion_by_topic()
    stats["reply_time_by_emotion"] = self.temporal_analysis.avg_reply_time_per_emotion()
    return stats
# average emotion per user
# average chain length
def get_user_analysis(self) -> dict:
    """Per-user activity stats plus the author-to-author interaction graph."""
    interactions = self.interaction_analysis
    return {
        "top_users": interactions.top_users(),
        "users": interactions.per_user_analysis(),
        "interaction_graph": interactions.interaction_graph(),
    }
# average / max thread depth
# high engagment threads based on volume
def get_interactional_analysis(self) -> dict:
    """Thread-structure stats: average depth and emotion-conditioned length."""
    stats = {}
    stats["average_thread_depth"] = self.interaction_analysis.average_thread_depth()
    stats["average_thread_length_by_emotion"] = (
        self.interaction_analysis.average_thread_length_by_emotion()
    )
    return stats
# detect community jargon
# in-group and out-group linguistic markers
def get_cultural_analysis(self) -> dict:
    """Community-culture stats (currently just identity markers)."""
    markers = self.linguistic_analysis.identity_markers()
    return {"identity_markers": markers}
def summary(self) -> dict:
@@ -206,122 +126,6 @@ class StatGen:
},
"sources": self.df["source"].dropna().unique().tolist()
}
# Aggregate content-level statistics:
#   - top `limit` non-stopword word frequencies across all content
#   - mean emotion scores per topic (neutral/surprise and "Misc" excluded)
#   - average reply latency grouped by dominant emotion
def content_analysis(self, limit: int = 100) -> dict:
texts = (
self.df["content"]
.dropna()
.astype(str)
.str.lower()
)
# Tokenize to lowercase words of 3+ letters, dropping stopwords.
words = []
for text in texts:
tokens = re.findall(r"\b[a-z]{3,}\b", text)
words.extend(
w for w in tokens
if w not in EXCLUDE_WORDS
)
counts = Counter(words)
word_frequencies = (
pd.DataFrame(counts.items(), columns=["word", "count"])
.sort_values("count", ascending=False)
.head(limit)
.reset_index(drop=True)
)
# Neutral/surprise are excluded from the per-topic emotion averages.
emotion_exclusions = [
"emotion_neutral",
"emotion_surprise"
]
emotion_cols = [
col for col in self.df.columns
if col.startswith("emotion_") and col not in emotion_exclusions
]
# NOTE(review): `counts` is reused here, shadowing the word Counter above —
# from this point on it holds per-topic event counts.
counts = (
self.df[
(self.df["topic"] != "Misc")
]
.groupby("topic")
.size()
.rename("n")
)
avg_emotion_by_topic = (
self.df[
(self.df["topic"] != "Misc")
]
.groupby("topic")[emotion_cols]
.mean()
.reset_index()
)
# Attach the per-topic event count as column "n".
avg_emotion_by_topic = avg_emotion_by_topic.merge(
counts,
on="topic"
)
return {
"word_frequencies": word_frequencies.to_dict(orient='records'),
"average_emotion_by_topic": avg_emotion_by_topic.to_dict(orient='records'),
"reply_time_by_emotion": self._avg_reply_time_per_emotion()
}
# Per-user statistics: top posters, comment/post ratios, vocabulary richness,
# and the author interaction graph.
def user_analysis(self) -> dict:
# Event counts per (author, source), most active first.
counts = (
self.df.groupby(["author", "source"])
.size()
.sort_values(ascending=False)
)
top_users = [
{"author": author, "source": source, "count": int(count)}
for (author, source), count in counts.items()
]
# Pivot to one row per author with "post" / "comment" count columns.
per_user = (
self.df.groupby(["author", "type"])
.size()
.unstack(fill_value=0)
)
# ensure columns always exist
for col in ("post", "comment"):
if col not in per_user.columns:
per_user[col] = 0
# replace(0, 1) guards the divisions against zero denominators.
per_user["comment_post_ratio"] = per_user["comment"] / per_user["post"].replace(0, 1)
per_user["comment_share"] = per_user["comment"] / (per_user["post"] + per_user["comment"]).replace(0, 1)
per_user = per_user.sort_values("comment_post_ratio", ascending=True)
per_user_records = per_user.reset_index().to_dict(orient="records")
vocab_rows = self._vocab_richness_per_user()
vocab_by_author = {row["author"]: row for row in vocab_rows}
# merge vocab richness + per_user information
# "vocab" is None for authors filtered out by the min-word threshold.
merged_users = []
for row in per_user_records:
author = row["author"]
merged_users.append({
"author": author,
"post": int(row.get("post", 0)),
"comment": int(row.get("comment", 0)),
"comment_post_ratio": float(row.get("comment_post_ratio", 0)),
"comment_share": float(row.get("comment_share", 0)),
"vocab": vocab_by_author.get(author)
})
merged_users.sort(key=lambda u: u["comment_post_ratio"])
return {
"top_users": top_users,
"users": merged_users,
"interaction_graph": self._interaction_graph()
}
def search(self, search_query: str) -> dict:
self.df = self.df[