Compare commits
7 Commits
0c4dc02852
...
a347869353
| Author | SHA1 | Date | |
|---|---|---|---|
| a347869353 | |||
| 8b4e13702e | |||
| 8fa4f3fbdf | |||
| c6cae040f0 | |||
| addc1d4087 | |||
| 225133a074 | |||
| e903e1b738 |
@@ -39,6 +39,21 @@ const CulturalStats = ({ data }: CulturalStatsProps) => {
|
||||
return `${dominantLabel} (${(dominant[1] * 100).toFixed(1)}%)`;
|
||||
};
|
||||
|
||||
const stanceSublabel = (
|
||||
per1kTokens: number | undefined,
|
||||
emotionAvg: Record<string, number> | undefined,
|
||||
) => {
|
||||
const rateLabel =
|
||||
typeof per1kTokens === "number"
|
||||
? `${per1kTokens.toFixed(1)} per 1k words`
|
||||
: "Word frequency";
|
||||
const emotionLabel = topEmotion(emotionAvg);
|
||||
|
||||
return emotionLabel === "—"
|
||||
? rateLabel
|
||||
: `${rateLabel} • Avg mood: ${emotionLabel}`;
|
||||
};
|
||||
|
||||
return (
|
||||
<div style={styles.page}>
|
||||
<div style={{ ...styles.container, ...styles.grid }}>
|
||||
@@ -107,41 +122,37 @@ const CulturalStats = ({ data }: CulturalStatsProps) => {
|
||||
<Card
|
||||
label="Hedging Words"
|
||||
value={stance?.hedge_total?.toLocaleString() ?? "—"}
|
||||
sublabel={
|
||||
typeof stance?.hedge_per_1k_tokens === "number"
|
||||
? `${stance.hedge_per_1k_tokens.toFixed(1)} per 1k words`
|
||||
: "Word frequency"
|
||||
}
|
||||
sublabel={stanceSublabel(
|
||||
stance?.hedge_per_1k_tokens,
|
||||
stance?.hedge_emotion_avg,
|
||||
)}
|
||||
style={{ gridColumn: "span 3" }}
|
||||
/>
|
||||
<Card
|
||||
label="Certainty Words"
|
||||
value={stance?.certainty_total?.toLocaleString() ?? "—"}
|
||||
sublabel={
|
||||
typeof stance?.certainty_per_1k_tokens === "number"
|
||||
? `${stance.certainty_per_1k_tokens.toFixed(1)} per 1k words`
|
||||
: "Word frequency"
|
||||
}
|
||||
sublabel={stanceSublabel(
|
||||
stance?.certainty_per_1k_tokens,
|
||||
stance?.certainty_emotion_avg,
|
||||
)}
|
||||
style={{ gridColumn: "span 3" }}
|
||||
/>
|
||||
<Card
|
||||
label="Need/Should Words"
|
||||
value={stance?.deontic_total?.toLocaleString() ?? "—"}
|
||||
sublabel={
|
||||
typeof stance?.deontic_per_1k_tokens === "number"
|
||||
? `${stance.deontic_per_1k_tokens.toFixed(1)} per 1k words`
|
||||
: "Word frequency"
|
||||
}
|
||||
sublabel={stanceSublabel(
|
||||
stance?.deontic_per_1k_tokens,
|
||||
stance?.deontic_emotion_avg,
|
||||
)}
|
||||
style={{ gridColumn: "span 3" }}
|
||||
/>
|
||||
<Card
|
||||
label="Permission Words"
|
||||
value={stance?.permission_total?.toLocaleString() ?? "—"}
|
||||
sublabel={
|
||||
typeof stance?.permission_per_1k_tokens === "number"
|
||||
? `${stance.permission_per_1k_tokens.toFixed(1)} per 1k words`
|
||||
: "Word frequency"
|
||||
}
|
||||
sublabel={stanceSublabel(
|
||||
stance?.permission_per_1k_tokens,
|
||||
stance?.permission_emotion_avg,
|
||||
)}
|
||||
style={{ gridColumn: "span 3" }}
|
||||
/>
|
||||
|
||||
|
||||
@@ -88,6 +88,15 @@ export default function UserModal({
|
||||
</div>
|
||||
</div>
|
||||
) : null}
|
||||
|
||||
{userData.dominant_topic ? (
|
||||
<div style={styles.topUserItem}>
|
||||
<div style={styles.topUserName}>Most Common Topic</div>
|
||||
<div style={styles.topUserMeta}>
|
||||
{userData.dominant_topic.topic} ({userData.dominant_topic.count} events)
|
||||
</div>
|
||||
</div>
|
||||
) : null}
|
||||
</div>
|
||||
)}
|
||||
</DialogPanel>
|
||||
|
||||
@@ -34,6 +34,11 @@ type Vocab = {
|
||||
top_words: FrequencyWord[];
|
||||
};
|
||||
|
||||
type DominantTopic = {
|
||||
topic: string;
|
||||
count: number;
|
||||
};
|
||||
|
||||
type User = {
|
||||
author: string;
|
||||
post: number;
|
||||
@@ -41,6 +46,7 @@ type User = {
|
||||
comment_post_ratio: number;
|
||||
comment_share: number;
|
||||
avg_emotions?: Record<string, number>;
|
||||
dominant_topic?: DominantTopic | null;
|
||||
vocab?: Vocab | null;
|
||||
};
|
||||
|
||||
@@ -162,6 +168,10 @@ type StanceMarkers = {
|
||||
certainty_per_1k_tokens: number;
|
||||
deontic_per_1k_tokens: number;
|
||||
permission_per_1k_tokens: number;
|
||||
hedge_emotion_avg?: Record<string, number>;
|
||||
certainty_emotion_avg?: Record<string, number>;
|
||||
deontic_emotion_avg?: Record<string, number>;
|
||||
permission_emotion_avg?: Record<string, number>;
|
||||
};
|
||||
|
||||
type EntityEmotionAggregate = {
|
||||
@@ -202,6 +212,7 @@ type FilterResponse = {
|
||||
|
||||
export type {
|
||||
TopUser,
|
||||
DominantTopic,
|
||||
Vocab,
|
||||
User,
|
||||
InteractionGraph,
|
||||
|
||||
BIN
report/img/reddit_bot.png
Normal file
BIN
report/img/reddit_bot.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 232 KiB |
BIN
report/img/ucc_crest.png
Normal file
BIN
report/img/ucc_crest.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 27 KiB |
224
report/main.tex
224
report/main.tex
@@ -8,27 +8,39 @@
|
||||
|
||||
\begin{titlepage}
|
||||
\centering
|
||||
\vspace*{3cm}
|
||||
|
||||
{\Huge \textbf{Web-Based Tool for Observing and Analysing Online Communities} \par}
|
||||
|
||||
% UCC Crest (clickable link)
|
||||
\vspace*{1.5cm}
|
||||
\href{https://www.ucc.ie/en/}{%
|
||||
\includegraphics[width=0.25\textwidth]{img/ucc_crest.png}
|
||||
}
|
||||
|
||||
\vspace{2cm}
|
||||
|
||||
% Title
|
||||
{\Huge\bfseries Web-Based Tool for Observing\\[0.3cm]
|
||||
and Analysing Online Communities \par}
|
||||
|
||||
\vspace{2.5cm}
|
||||
|
||||
% Author
|
||||
{\Large Dylan De Faoite \par}
|
||||
|
||||
\vspace{0.5cm}
|
||||
|
||||
% Date
|
||||
{\large April 2026 \par}
|
||||
|
||||
\vfill
|
||||
|
||||
% Degree + Institution
|
||||
{\large
|
||||
Bachelor of Science in Computer Science \\
|
||||
University College Cork \\
|
||||
Bachelor of Science in Computer Science \\[0.2cm]
|
||||
University College Cork \\[0.2cm]
|
||||
Supervisor: Paolo Palmieri
|
||||
\par}
|
||||
|
||||
\vspace{2cm}
|
||||
\vspace{1.5cm}
|
||||
|
||||
\end{titlepage}
|
||||
|
||||
@@ -101,9 +113,9 @@ This section describes common keywords and metrics use to measure and quantify o
|
||||
Sentiment Analysis involves capturing the emotions associated with a specific post, topic or entity. This type of analysis can be as simple as classifying a post as "positive" or "negative", or classifying a post into a set of pre-existing emotions such as anger, joy or sadness.
|
||||
|
||||
\subsubsection{Active vs Passive Participation}
|
||||
Not everyone in an online community participates in the same way. Some users post regularly, leave comments, and interact with others, while many more simply read content without ever contributing anything themselves. Some might only contribute occasionally.
|
||||
Not everyone in an online community participates in the same way. Some users post regularly and leave comments while others might simply read content without ever contributing anything themselves. Some might only contribute occasionally.
|
||||
|
||||
This distinction between active and passive participation (passive users are often referred to as "lurkers") is an important one in digital ethnography, because looking only at posts and comments can give a misleading picture of how large or engaged a community actually is.
|
||||
This distinction between active and passive participation (passive users are often referred to as "lurkers") is important in digital ethnography, because looking only at posts and comments can give a misleading picture of how large or engaged a community actually is.
|
||||
|
||||
\subsubsection{Temporal Activity Patterns}
|
||||
Looking at when a community is active can reveal quite a lot about its nature and membership. A subreddit that peaks at 2am UTC might have a mostly American userbase, while one that is consistently active across all hours could suggest a more globally distributed community. Beyond timezones, temporal patterns can also capture things like how a community responds to external events, like a sudden spike in posting activity often corresponds to something newsworthy happening that is relevant to the community.
|
||||
@@ -111,6 +123,19 @@ Looking at when a community is active can reveal quite a lot about its nature an
|
||||
\subsubsection{Cultural Markers}
|
||||
Cultural markers are the words, phrases, memes, and behaviours that are specific to a particular community and signal that someone is a member of it. These might include in-jokes, niche slang, recurring references, or even particular ways of formatting posts. In the context of digital ethnography, identifying these markers is useful because they reveal how communities build a shared identity and distinguish themselves from outsiders.
|
||||
|
||||
Some patterns, such as usage of words like "we, us, our, ourselves", where posts are referring to themselves as a community might have different sentiment to posts where words like "they, them, their, themselves" are used. These are known as "identity markers" and they can be used to identify how welcoming a community might be to outsiders.
|
||||
|
||||
\subsubsection{Stance Markers}
|
||||
Stance Markers refer to the usage of different phrasing patterns which can reveal the speaker's attitude towards topics. There are different kinds of these phrasings, such as hedge, certainty, deontic and permission patterns.
|
||||
|
||||
\textbf{Hedge Patterns} are usually phrases that contain words like "maybe, possibly, probably, I think, I feel" and generally mean that someone is unsure or suspicious about something.
|
||||
|
||||
\textbf{Certainty Patterns} contain phrases like "definitely, certainly, clearly, obviously" and as the name suggests, imply certainty or assuredness.
|
||||
|
||||
\textbf{Deontic Patterns} contain phrases that imply obligation, such as "must, should, need, have to". In the context of online communities, these patterns are often used to assert authority or to reinforce communal norms and "unwritten rules."
|
||||
|
||||
\textbf{Permission Patterns} refer to phrases where someone is asking permission, like "can, allowed, ok, permitted". These patterns could serve as an indicator of a user's status within an online community.
|
||||
|
||||
\subsection{Natural Language Processing}
|
||||
\textbf{Natural Language Processing} is a branch of artificial intelligence that allows machines to interpret, analyse and generate human language. The aim of NLP models is not only to understand single words individually, but to be able to understand the context of those words in a broader paragraph or story.
|
||||
|
||||
@@ -132,6 +157,11 @@ NLP techniques can be used to automatically process and analyse large volumes an
|
||||
|
||||
This method is often used to organise lots of unstructured data, such as news articles, research papers, or social media posts.
|
||||
|
||||
\subsubsection{Stop Words}
|
||||
\textbf{Stop Words} are common words that are often filtered out in NLP tasks because they carry little meaningful information. Examples of stop words include "the", "is", "in", "and", etc. Removing stop words can help improve the performance of NLP models by reducing noise and focusing on more informative words. However, the choice of stop words can vary depending on the context and the specific task at hand.
|
||||
|
||||
For example, in a Cork-specific dataset, words like "ah", or "grand" might be considered stop words, as they are commonly used in everyday speech but do not carry significant meaning for analysis.
|
||||
|
||||
\subsection{Limits of Computational Analysis}
|
||||
While computational methods enable large-scale observation and analysis of online communities, there are many limitations that must be acknowledged. Many limitations come from NLP techniques and the practical boundaries of computational resources.
|
||||
|
||||
@@ -158,7 +188,7 @@ The dataset is drawn from four distinct online platforms, each of which represen
|
||||
|
||||
Reddit's hierarchical comment threading enables deep conversational analysis and reply-chain metrics, whereas YouTube comments are largely flat and unthreaded. Boards.ie occupies a middle ground, with linear threads but a more intimate community character. Taken together, the four sources offer variation in interaction structure, community age, demographic composition, and linguistic register, all of which are factors that the system's analytical modules are designed to detect and compare.
|
||||
|
||||
Collecting data across multiple platforms also introduces the challenge of normalisation. Posts, comments, and metadata fields differ in schema and semantics across sources. A core design requirement of the system is the normalisation of these inputs into a unified event-based internal representation, allowing the same analytical pipeline to operate uniformly regardless of the source.
|
||||
Due to data being collected across multiple platforms, they must be normalised into a single data model. Posts, comments, and metadata fields differ in schema and semantics across sources. A core design requirement of the system is the normalisation of these inputs into a unified event-based internal representation, allowing the same analytical pipeline to operate uniformly regardless of the source.
|
||||
|
||||
\newpage
|
||||
\section{Analysis}
|
||||
@@ -279,7 +309,7 @@ All data fetched from social media sites are stored locally in a PostgreSQL data
|
||||
|
||||
All datasets are associated with one and only one user account, and the users themselves are responsible for uploading or fetching the data, analysing the data, and deleting the data when they are done. The system will not retain any data beyond what is necessary for the end-user to carry out their analysis, and users will have the option to delete their datasets at any time.
|
||||
|
||||
The system will not store any personally identifiable information beyond what is necessary for the analysis, which includes only usernames and timestamps. The system will not attempt to de-anonymise content creators or link data across platforms.
|
||||
The system will not store any personally identifiable information except for what is necessary for the analysis, which includes only usernames and timestamps. The system will not attempt to de-anonymise content creators or link data across platforms.
|
||||
|
||||
\subsubsection{User Security}
|
||||
Standard security practices will be followed to protect user data and prevent unauthorised access. This includes:
|
||||
@@ -409,22 +439,18 @@ The following requirements are derived from the backend architecture, NLP proces
|
||||
\label{fig:schema}
|
||||
\end{figure}
|
||||
|
||||
\subsection{Ethnographic Analysis}
|
||||
Ethnographic analysis can be carried out from many different perspectives, such as the perspective of a single user or the community as a whole. The system is designed to support both of these perspectives, as well as the ability to zoom in and out between them. For example, a researcher might want to look at the overall emotional tone of a community, but then zoom in to see how a specific user contributes to that tone.
|
||||
\subsection{Client-Server Architecture}
|
||||
The system will follow a client-server architecture, with a Flask-based backend API and a React-based frontend interface. The backend will handle data processing, NLP analysis, and database interactions, while the frontend will provide an interactive user interface for data exploration and visualization.
|
||||
|
||||
The system is designed to support multiple types of analysis, such as:
|
||||
\begin{itemize}
|
||||
\item \textbf{Temporal Analysis}: looking at when a community is active and how that activity changes over time.
|
||||
\item \textbf{Linguistic Analysis}: looking at the words and phrases that are commonly used in a community, and how they relate to identity and culture.
|
||||
\item \textbf{Emotional Analysis}: looking at the emotional tone of a community, and how it varies across different topics or users.
|
||||
\item \textbf{User Analysis}: looking at the behaviour and activity of individual users, and how they contribute to the community.
|
||||
\item \textbf{Interaction Analysis}: looking at how users interact with each other, such as who replies to whom and how conversations develop.
|
||||
\item \textbf{Cultural Analysis}: looking at the cultural markers and identity signals that are present in a community, such as slang, memes, and recurring references.
|
||||
\end{itemize}
|
||||
The reasoning behind this architecture is that it allows the analytics to be aggregated and computed on the server side using Pandas which is much faster than doing it on the client frontend. The frontend will focus on rendering and visualising the data.
|
||||
|
||||
Each of these types of analysis are available at different API endpoints for any given dataset, and the frontend is designed to allow users to easily switch between them and explore the data from different angles.
|
||||
\subsubsection{Flask API}
|
||||
The Flask backend will expose a RESTful API with endpoints for dataset management, authentication and user management, and analytical queries. Flask will call on backend components for data parsing, normalisation, NLP processing and database interfacing.
|
||||
|
||||
Flask was chosen for its simplicity, familiarity and speed of development. It also has many extensions that can be used for authentication (Flask-Bcrypt, Flask-Login).
|
||||
|
||||
\subsubsection{React Frontend}
|
||||
React was chosen for the frontend due to its massive library of pre-built components with efficient rendering capabilities and ability to display many different types of data. The frontend will be structured around a tabbed interface, with each tab corresponding to a different analytical endpoint (e.g., temporal analysis, linguistic analysis, emotional analysis). Each tab will fetch data from the backend API and render it using appropriate visualisation libraries (react-wordcloud for word clouds, react-chartjs-2 for charts, etc). The frontend will also include controls for filtering the dataset based on keywords, date ranges, and data sources.
|
||||
|
||||
\subsection{Data Pipeline}
|
||||
As this project is focused on the collection and analysis of online community data, the primary component that must be well-designed is the data pipeline, which encompasses the processes of data ingestion, normalisation, enrichment, storage, and retrieval for analysis.
|
||||
@@ -485,7 +511,7 @@ After normalisation, the dataset is enriched with additional derived fields and
|
||||
\item \textbf{Named Entity Recognition}: Each event is processed to identify any named entities mentioned in the text, such as people, places, or organisations, which are stored as a list associated with the event.
|
||||
\end{itemize}
|
||||
|
||||
NLP processing allows for much richer analysis of the dataset, as it provides additional layers of information beyond just the raw text content. After enrichment, the dataset is ready to be stored in the database and made available for analysis through the API endpoints.
|
||||
NLP processing lets us perform much richer analysis of the dataset, as it provides additional layers of information beyond just the raw text content. After enrichment, the dataset is ready to be stored in the database and made available for analysis through the API endpoints.
|
||||
|
||||
\subsubsection{Data Storage}
|
||||
The enriched dataset is stored in a PostgreSQL database, with a schema similar to the unified data model defined in the normalisation section, with additional fields for the derived data, NLP outputs, and user ownership. Each dataset is associated with a specific user account, and the system supports multiple datasets per user.
|
||||
@@ -512,6 +538,124 @@ The \texttt{events} table in PostgreSQL contains the following fields:
|
||||
\subsubsection{Data Retrieval}
|
||||
The stored dataset can then be retrieved through the Flask API endpoints for analysis. The API supports filtering by keywords and date ranges, as well as grouping and aggregation for various analytical outputs.
|
||||
|
||||
\subsection{Ethnographic Analysis}
|
||||
The main goal of this project is to provide a tool that can assist researchers with ethnographic analysis of online communities. Therefore, ethnographic analysis will be a core component of the system.
|
||||
|
||||
Ethnographic analysis can be carried out from many different perspectives, such as the perspective of a single user or the community as a whole. The system is designed to support both of these perspectives, as well as the ability to zoom in and out between them. For example, a researcher might want to look at the overall emotional tone of a community, but then zoom in to see how a specific user contributes to that tone.
|
||||
|
||||
The system is designed to support multiple types of analysis, such as:
|
||||
\begin{itemize}
|
||||
\item \textbf{Temporal Analysis}: looking at when a community is active and how that activity changes over time.
|
||||
\item \textbf{Linguistic Analysis}: looking at the words and phrases that are commonly used in a community, and how they relate to identity and culture.
|
||||
\item \textbf{Emotional Analysis}: looking at the emotional tone of a community, and how it varies across different topics or users.
|
||||
\item \textbf{User Analysis}: looking at the behaviour and activity of individual users, and how they contribute to the community.
|
||||
\item \textbf{Interaction Analysis}: looking at how users interact with each other, such as who replies to whom and how conversations develop.
|
||||
\item \textbf{Cultural Analysis}: looking at the cultural markers and identity signals that are present in a community, such as slang, memes, and recurring references.
|
||||
\end{itemize}
|
||||
|
||||
Each of these types of analysis are available at different API endpoints for any given dataset, and the frontend is designed to allow users to easily switch between them and explore the data from different angles.
|
||||
|
||||
For each type of analysis that involves analysing the content of the posts themselves, they will be split into tokens and stop words will be stripped from them, which makes analysis easier.
|
||||
|
||||
\subsubsection{Temporal Analysis}
|
||||
Temporal analysis allows researchers to understand what a community is talking about over time, and how the emotional tone of the community changes over time. For example, a researcher might want to see how discussions around a specific topic evolve over time, or how the emotional tone of a community changes in response to external events.
|
||||
|
||||
However, a major limitation of the data captured for this system, whether it is the Cork dataset or any automatically fetched dataset, is that it will only stretch at most a few weeks back in time. This is because the system is designed to fetch only the most recent posts and comments from social media platforms, which means that it will not capture historical data beyond a certain point. Therefore, while temporal analysis can still be carried out on the dataset, it will be limited to a relatively short timeframe.
|
||||
|
||||
In this system, temporal analysis will be limited to:
|
||||
\begin{itemize}
|
||||
\item Event frequency per day.
|
||||
\item Weekday--hour heatmap data representing activity distribution.
|
||||
\end{itemize}
|
||||
|
||||
\textbf{Average reply time per emotion} was considered as a potential temporal analysis metric, but was eventually excluded due to inconsistent and statistically insignificant results that yielded no meaningful analytical insight.
|
||||
|
||||
\subsubsection{Linguistic Analysis}
|
||||
Linguistic analysis allows researchers to understand the language and words used in a community. For example, a researcher might want to see what words are most commonly used in a community, or how the language used in a community relates to identity and culture.
|
||||
|
||||
In this system, linguistic analysis will include:
|
||||
\begin{itemize}
|
||||
\item Word frequency statistics excluding standard and domain-specific stopwords.
|
||||
\item Common bi-grams and tri-grams from textual content.
|
||||
\item Lexical diversity metrics for the dataset.
|
||||
\end{itemize}
|
||||
|
||||
The word frequencies and n-gram metrics were chosen because they can provide insights into the language and phrases used commonly in an online community, which is important for ethnographic analysis and understanding a community fully. Lexical diversity metrics such as the total number of unique tokens versus the total number of tokens can show if a specific culture often repeats phrases (like memes, slang etc.) or if they often have structured, serious discussion without repeating themselves.
|
||||
|
||||
Outlining a list of stopwords is essential for linguistic analysis, as it filters out common words that wouldn't be useful for linguistic analysis. Stop Word lists can be provided by a Python library such as NLTK.
|
||||
|
||||
In addition to standard stop words, the system also excludes link tokens such as "www", "http", and "https" from the word frequency analysis, as social media users will often include links in their posts and comments, and these tokens can become quite common and skew the word frequency results without adding meaningful insight.
|
||||
|
||||
\subsubsection{User Analysis}
|
||||
User analysis allows researchers to understand the behaviour and activity of individual users within a community. For example, a researcher might want to see who the most active users are in a community, or how different users contribute to the overall emotional tone of the community.
|
||||
|
||||
In this system, user analysis will include:
|
||||
\begin{itemize}
|
||||
\item Identification of top users based on activity.
|
||||
\item Per-user activity such as:
|
||||
\begin{itemize}
|
||||
\item Total number of events (posts and comments).
|
||||
\item Average emotion distribution across their events.
|
||||
\item Average topic distribution across their events.
|
||||
\item Comment-to-post ratio.
|
||||
\item Vocabulary information such as top words used and lexical diversity.
|
||||
\end{itemize}
|
||||
\end{itemize}
|
||||
|
||||
Initially the user endpoint contained the interactional statistics as well, as a case could be made for the user analysis and interaction analysis being combined, however a distinction can be made between individual user analysis and user analysis on a larger, community-level scale focused on interactions. This allows the user endpoint to stay focused on singular user analysis while still using NLP outputs like emotions and topics.
|
||||
|
||||
Identifying top users allows us to see the most active and prolific posters in a community, which might often be site-specific bots that comment on every post, or deleted users, which often show up as simply "[Deleted User]" and can aggregate together in statistics. An example might be an AutoModerator bot on Reddit, seen below.
|
||||
|
||||
\begin{figure}[h]
|
||||
\centering
|
||||
\includegraphics[width=0.75\textwidth]{img/reddit_bot.png}
|
||||
\caption{An AutoModerator Bot on r/politics}
|
||||
\label{fig:bot}
|
||||
\end{figure}
|
||||
|
||||
While it's impossible to filter out all of these bots, deleted users can simply be filtered out using an exclusion list.
|
||||
|
||||
\subsubsection{Interactional Analysis}
|
||||
Instead of per-user analysis, interactional analysis looks at the interactions between users, such as who replies to whom and who is contributing the most to the conversations.
|
||||
|
||||
In this system, interactional analysis will include:
|
||||
\begin{itemize}
|
||||
\item Top interaction pairs between users.
|
||||
\item An interaction graph based on user relationships.
|
||||
\item Conversation concentration metrics such as who is contributing the most to the conversations and how much of the conversation is dominated by a small number of users.
|
||||
\end{itemize}
|
||||
|
||||
For simplicity, an interaction is defined as a reply from one user to another, which can be either a comment replying to a post or a comment replying to another comment. The system will not attempt to capture more complex interactions such as mentions or indirect references between users, as these would require more advanced NLP techniques.
|
||||
|
||||
\textbf{Average reply chain depth} was considered as a metric, however forum-based social media sites, such as boards.ie, do not have a way to reply to comments in the same way that Reddit does, therefore the concept of "reply chains" doesn't apply cleanly in the same way. One possible solution is to infer reply relationships from explicit user mentions embedded in content of the post, but this is not a reliable method.
|
||||
|
||||
\subsubsection{Emotional Analysis}
|
||||
Emotional analysis allows researchers to understand the emotional tone of a community, and how it varies across different topics and users.
|
||||
|
||||
In this system, emotional analysis will include:
|
||||
\begin{itemize}
|
||||
\item Average emotion by topic.
|
||||
\item Overall average emotional distribution across the dataset.
|
||||
\item Dominant emotion distributions for each event
|
||||
\item Average emotion by data source
|
||||
\end{itemize}
|
||||
|
||||
It is emphasised that emotional analysis is inaccurate on an individual post level as the models cannot fully capture the nuance of human interaction and slang. Warnings will be presented to the user in the frontend that AI outputs can possibly be misleading on an individual scale, and accuracy only increases with more posts. Even then it will not be perfect.
|
||||
|
||||
In an ideal world, the models are accurate enough to capture general emotions on a macro-scale.
|
||||
|
||||
\subsubsection{Cultural Analysis}
|
||||
Cultural analysis allows researchers to understand the cultural markers and identity signals that are present in a community, such as slang, memes, and recurring references. While some of this is covered in the linguistic analysis, cultural analysis will focus more on the identity and stance-related markers that are present in the language of the community.
|
||||
|
||||
In this system, cultural analysis will include:
|
||||
\begin{itemize}
|
||||
\item In-Group vs Out-Group phrasing
|
||||
\item Average emotion for in-group vs out-group phrasing
|
||||
\item Stance Markers
|
||||
\item Average emotions per stance marker type
|
||||
\item Average emotions per entity
|
||||
\end{itemize}
|
||||
|
||||
\subsection{Automatic Data Collection}
|
||||
Originally, the system was designed to only support manual dataset uploads, where users would collect their own data from social media platforms and format it into the required \texttt{.jsonl} format.
|
||||
|
||||
@@ -533,21 +677,15 @@ Creating a base interface for what a connector should look like allows for the e
|
||||
|
||||
The connector registry is designed so that any new connector implementing \texttt{BaseConnector} is automatically discovered and registered at runtime, without requiring changes to any existing code. This allows for a modular and extensible architecture where new data sources can be integrated with minimal effort.
|
||||
|
||||
\subsection{Client-Server Architecture}
|
||||
The system will follow a client-server architecture, with a Flask-based backend API and a React-based frontend interface. The backend will handle data processing, NLP analysis, and database interactions, while the frontend will provide an interactive user interface for data exploration and visualization.
|
||||
\subsection{Asynchronous Processing}
|
||||
The usage of NLP models for tasks such as sentiment analysis, topic classification, and entity recognition can be computationally intensive, especially for large datasets. To prevent the Flask API from blocking while these tasks are being processed, an asynchronous processing queue will be implemented using \textbf{Redis} and \textbf{Celery}.
|
||||
|
||||
The reasoning behind this architecture is that it allows the analytics to be aggregated and computed on the server side using Pandas which is much faster than doing it on the client frontend. The frontend will focus on rendering and visualising the data.
|
||||
When NLP processing is triggered or data is being fetched from social media APIs, a task will be added to the Redis queue. Celery workers will then pop tasks off the Redis queue and process these tasks in the background, which ensures that the API remains responsive to user requests. This approach also allows for better scalability, as additional workers can be added to handle increased load.
|
||||
|
||||
\subsubsection{Flask API}
|
||||
The Flask backend will expose a RESTful API with endpoints for dataset management, authentication and user management, and analytical queries. Flask will call on backend components for data parsing, normalisation, NLP processing and database interfacing.
|
||||
Some of these tasks, such as fetching data from social media APIs, are very long-running and can take hours to complete. By using asynchronous processing that updates the database with progress updates, users can see the status of their data fetching through the frontend.
|
||||
|
||||
Flask was chosen for its simplicity, familiarity and speed of development. It also has many extensions that can be used for authentication (Flask-Bcrypt, Flask-Login).
|
||||
|
||||
\subsubsection{React Frontend}
|
||||
React was chosen for the frontend due to its massive library of pre-built components with efficient rendering capabilities and ability to display many different types of data. The frontend will be structured around a tabbed interface, with each tab corresponding to a different analytical endpoint (e.g., temporal analysis, linguistic analysis, emotional analysis). Each tab will fetch data from the backend API and render it using appropriate visualisation libraries (react-wordcloud for word clouds, react-chartjs-2 for charts, etc). The frontend will also include controls for filtering the dataset based on keywords, date ranges, and data sources.
|
||||
|
||||
|
||||
\subsection{Database vs On-Disk Storage}
|
||||
\subsection{Design Tradeoffs}
|
||||
\subsubsection{Database vs On-Disk Storage}
|
||||
Originally, the system was designed to store \texttt{json} datasets on disk and load them into memory for processing. This was simple and time-efficient for early development and testing. However, as the functionality of the system expanded, it became clear that a more persistent and scalable storage solution was needed.
|
||||
|
||||
Storing datasets in a database allows for more efficient querying, filtering, and updating of data without needing to reload entire datasets into memory. However, the primary benefit of using a database is support for \textbf{multiple users and multiple datasets per user}.
|
||||
@@ -556,27 +694,21 @@ An additional benefit of using a database was that it allowed the NLP processing
|
||||
|
||||
\texttt{PostgreSQL} was chosen as the database solution due to its robustness, support for complex queries, and compatibility with Python through \texttt{psycopg2}. PostgreSQL's support for JSONB fields allows for storage of unstructured NLP outputs, which alternatives like SQLite do not support.
|
||||
|
||||
\subsection{Asynchronous Processing}
|
||||
The usage of NLP models for tasks such as sentiment analysis, topic classification, and entity recognition can be computationally intensive, especially for large datasets. To prevent the Flask API from blocking while these tasks are being processed, an asynchronous processing queue will be implemented using \textbf{Redis} and \textbf{Celery}.
|
||||
\subsubsection{Unified Data Model vs Split Data Model}
|
||||
|
||||
When NLP processing is triggered or data is being fetched from social media APIs, a task will be added to the Redis queue. Celery workers will then pop tasks off the Redis queue and process these tasks in the background, which ensures that the API remains responsive to user requests. This approach also allows for better scalability, as additional workers can be added to handle increased load.
|
||||
|
||||
Some of these tasks, such as fetching data from social media APIs, are very long-running and can take hours to complete. By using asynchronous processing that updates the database with progress updates, users can see the status of their data fetching through the frontend.
|
||||
|
||||
\subsection{Docker Deployment}
|
||||
Docker Compose will be used to containerise the entire application, including:
|
||||
\subsection{Deployment}
|
||||
Docker Compose is used to containerise the entire application, including:
|
||||
\begin{itemize}
|
||||
\item The Flask backend API
|
||||
\item The React frontend interface
|
||||
\item The PostgreSQL database
|
||||
\item The Redis server for task queuing
|
||||
\item Celery workers for asynchronous processing
|
||||
\item NLP model caching and management
|
||||
\end{itemize}
|
||||
|
||||
In addition, the source code for the backend and frontend will be mounted as volumes within the containers to allow for live code updates during development, which will speed up the process.
|
||||
During development, the source code for the backend and frontend will be mounted as volumes within the containers to allow for live code updates during development, which will speed up the process.
|
||||
|
||||
Environment variables, such as database credentials and social media API keys, will be managed through an \texttt{.env} file that is passed into the Docker containers through \texttt{docker-compose.yml}.
|
||||
Environment variables, such as database credentials and social media API keys, will be managed through an \texttt{.env} file that is passed into the Docker containers through \texttt{docker-compose.yaml}.
|
||||
|
||||
|
||||
\newpage
|
||||
|
||||
@@ -67,6 +67,12 @@ class CulturalAnalysis:
|
||||
|
||||
def get_stance_markers(self, df: pd.DataFrame) -> dict[str, Any]:
|
||||
s = df[self.content_col].fillna("").astype(str)
|
||||
emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
|
||||
emotion_cols = [
|
||||
c
|
||||
for c in df.columns
|
||||
if c.startswith("emotion_") and c not in emotion_exclusions
|
||||
]
|
||||
|
||||
hedge_pattern = re.compile(
|
||||
r"\b(maybe|perhaps|possibly|probably|likely|seems|seem|i think|i feel|i guess|kind of|sort of|somewhat)\b"
|
||||
@@ -88,7 +94,7 @@ class CulturalAnalysis:
|
||||
0, 1
|
||||
)
|
||||
|
||||
return {
|
||||
result = {
|
||||
"hedge_total": int(hedge_counts.sum()),
|
||||
"certainty_total": int(certainty_counts.sum()),
|
||||
"deontic_total": int(deontic_counts.sum()),
|
||||
@@ -107,6 +113,32 @@ class CulturalAnalysis:
|
||||
),
|
||||
}
|
||||
|
||||
if emotion_cols:
|
||||
emo = df[emotion_cols].apply(pd.to_numeric, errors="coerce").fillna(0.0)
|
||||
|
||||
result["hedge_emotion_avg"] = (
|
||||
emo.loc[hedge_counts > 0].mean()
|
||||
if (hedge_counts > 0).any()
|
||||
else pd.Series(0.0, index=emotion_cols)
|
||||
).to_dict()
|
||||
result["certainty_emotion_avg"] = (
|
||||
emo.loc[certainty_counts > 0].mean()
|
||||
if (certainty_counts > 0).any()
|
||||
else pd.Series(0.0, index=emotion_cols)
|
||||
).to_dict()
|
||||
result["deontic_emotion_avg"] = (
|
||||
emo.loc[deontic_counts > 0].mean()
|
||||
if (deontic_counts > 0).any()
|
||||
else pd.Series(0.0, index=emotion_cols)
|
||||
).to_dict()
|
||||
result["permission_emotion_avg"] = (
|
||||
emo.loc[perm_counts > 0].mean()
|
||||
if (perm_counts > 0).any()
|
||||
else pd.Series(0.0, index=emotion_cols)
|
||||
).to_dict()
|
||||
|
||||
return result
|
||||
|
||||
def get_avg_emotions_per_entity(
|
||||
self, df: pd.DataFrame, top_n: int = 25, min_posts: int = 10
|
||||
) -> dict[str, Any]:
|
||||
|
||||
@@ -71,6 +71,7 @@ class UserAnalysis:
|
||||
per_user = df.groupby(["author", "type"]).size().unstack(fill_value=0)
|
||||
|
||||
emotion_cols = [col for col in df.columns if col.startswith("emotion_")]
|
||||
dominant_topic_by_author = {}
|
||||
|
||||
avg_emotions_by_author = {}
|
||||
if emotion_cols:
|
||||
@@ -80,6 +81,31 @@ class UserAnalysis:
|
||||
for author, row in avg_emotions.iterrows()
|
||||
}
|
||||
|
||||
if "topic" in df.columns:
|
||||
topic_df = df[
|
||||
df["topic"].notna()
|
||||
& (df["topic"] != "")
|
||||
& (df["topic"] != "Misc")
|
||||
]
|
||||
if not topic_df.empty:
|
||||
topic_counts = (
|
||||
topic_df.groupby(["author", "topic"])
|
||||
.size()
|
||||
.reset_index(name="count")
|
||||
.sort_values(
|
||||
["author", "count", "topic"],
|
||||
ascending=[True, False, True],
|
||||
)
|
||||
.drop_duplicates(subset=["author"])
|
||||
)
|
||||
dominant_topic_by_author = {
|
||||
row["author"]: {
|
||||
"topic": row["topic"],
|
||||
"count": int(row["count"]),
|
||||
}
|
||||
for _, row in topic_counts.iterrows()
|
||||
}
|
||||
|
||||
# ensure columns always exist
|
||||
for col in ("post", "comment"):
|
||||
if col not in per_user.columns:
|
||||
@@ -109,6 +135,7 @@ class UserAnalysis:
|
||||
"comment_post_ratio": float(row.get("comment_post_ratio", 0)),
|
||||
"comment_share": float(row.get("comment_share", 0)),
|
||||
"avg_emotions": avg_emotions_by_author.get(author, {}),
|
||||
"dominant_topic": dominant_topic_by_author.get(author),
|
||||
"vocab": vocab_by_author.get(
|
||||
author,
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user