Compare commits

..

3 Commits

10 changed files with 242 additions and 120 deletions

View File

@@ -5,7 +5,7 @@ import DatasetsPage from "./pages/Datasets";
 import DatasetStatusPage from "./pages/DatasetStatus";
 import LoginPage from "./pages/Login";
 import UploadPage from "./pages/Upload";
-import AutoScrapePage from "./pages/AutoScrape";
+import AutoFetchPage from "./pages/AutoFetch";
 import StatPage from "./pages/Stats";
 import { getDocumentTitle } from "./utils/documentTitle";
 import DatasetEditPage from "./pages/DatasetEdit";
@@ -23,7 +23,7 @@ function App() {
         <Route path="/" element={<Navigate to="/login" replace />} />
         <Route path="/login" element={<LoginPage />} />
         <Route path="/upload" element={<UploadPage />} />
-        <Route path="/auto-scrape" element={<AutoScrapePage />} />
+        <Route path="/auto-fetch" element={<AutoFetchPage />} />
         <Route path="/datasets" element={<DatasetsPage />} />
         <Route path="/dataset/:datasetId/status" element={<DatasetStatusPage />} />
         <Route path="/dataset/:datasetId/stats" element={<StatPage />} />

View File

@@ -37,7 +37,7 @@ const supportsSearch = (source?: SourceOption): boolean =>
 const supportsCategories = (source?: SourceOption): boolean =>
   Boolean(source?.categories_enabled ?? source?.categoriesEnabled);

-const AutoScrapePage = () => {
+const AutoFetchPage = () => {
   const navigate = useNavigate();
   const [datasetName, setDatasetName] = useState("");
   const [sourceOptions, setSourceOptions] = useState<SourceOption[]>([]);
@@ -106,11 +106,11 @@ const AutoScrapePage = () => {
     );
   };

-  const autoScrape = async () => {
+  const autoFetch = async () => {
     const token = localStorage.getItem("access_token");
     if (!token) {
       setHasError(true);
-      setReturnMessage("You must be signed in to auto scrape a dataset.");
+      setReturnMessage("You must be signed in to auto fetch a dataset.");
       return;
     }
@@ -243,7 +243,7 @@ const AutoScrapePage = () => {
       setReturnMessage("");
       const response = await axios.post(
-        `${API_BASE_URL}/datasets/scrape`,
+        `${API_BASE_URL}/datasets/fetch`,
         requestBody,
         {
           headers: {
@@ -255,7 +255,7 @@ const AutoScrapePage = () => {
       const datasetId = Number(response.data.dataset_id);
       setReturnMessage(
-        `Auto scrape queued successfully (dataset #${datasetId}). Redirecting to processing status...`,
+        `Auto fetch queued successfully (dataset #${datasetId}). Redirecting to processing status...`,
       );
       setTimeout(() => {
@@ -267,11 +267,11 @@ const AutoScrapePage = () => {
         const message = String(
           requestError.response?.data?.error ||
             requestError.message ||
-            "Auto scrape failed.",
+            "Auto fetch failed.",
         );
-        setReturnMessage(`Auto scrape failed: ${message}`);
+        setReturnMessage(`Auto fetch failed: ${message}`);
       } else {
-        setReturnMessage("Auto scrape failed due to an unexpected error.");
+        setReturnMessage("Auto fetch failed due to an unexpected error.");
       }
     } finally {
       setIsSubmitting(false);
@@ -283,9 +283,9 @@ const AutoScrapePage = () => {
     <div style={styles.containerWide}>
       <div style={{ ...styles.card, ...styles.headerBar }}>
         <div>
-          <h1 style={styles.sectionHeaderTitle}>Auto Scrape Dataset</h1>
+          <h1 style={styles.sectionHeaderTitle}>Auto Fetch Dataset</h1>
           <p style={styles.sectionHeaderSubtitle}>
-            Select sources and scrape settings, then queue processing
+            Select sources and fetch settings, then queue processing
             automatically.
           </p>
           <p
@@ -295,7 +295,7 @@ const AutoScrapePage = () => {
               color: "#9a6700",
             }}
           >
-            Warning: Scraping more than 250 posts from any single site can
+            Warning: Fetching more than 250 posts from any single site can
             take hours due to rate limits.
           </p>
         </div>
@@ -305,10 +305,10 @@ const AutoScrapePage = () => {
             ...styles.buttonPrimary,
             opacity: isSubmitting || isLoadingSources ? 0.75 : 1,
           }}
-          onClick={autoScrape}
+          onClick={autoFetch}
           disabled={isSubmitting || isLoadingSources}
         >
-          {isSubmitting ? "Queueing..." : "Auto Scrape and Analyze"}
+          {isSubmitting ? "Queueing..." : "Auto Fetch and Analyze"}
         </button>
       </div>
@@ -527,4 +527,4 @@ const AutoScrapePage = () => {
   );
 };

-export default AutoScrapePage;
+export default AutoFetchPage;

View File

@@ -108,9 +108,9 @@ const DatasetsPage = () => {
         <button
           type="button"
           style={styles.buttonSecondary}
-          onClick={() => navigate("/auto-scrape")}
+          onClick={() => navigate("/auto-fetch")}
         >
-          Auto Scrape Dataset
+          Auto Fetch Dataset
         </button>
       </div>
     </div>

View File

@@ -66,45 +66,88 @@ const EMPTY_EXPLORER_STATE: ExplorerState = {
   error: "",
 };

-const getExplorerRecordIdentity = (record: DatasetRecord) =>
-  JSON.stringify({
-    post_id: record.post_id ?? null,
-    parent_id: record.parent_id ?? null,
-    reply_to: record.reply_to ?? null,
-    author: record.author ?? null,
-    type: record.type ?? null,
-    timestamp: record.timestamp ?? null,
-    dt: record.dt ?? null,
-    title: record.title ?? null,
-    content: record.content ?? null,
-    source: record.source ?? null,
-    topic: record.topic ?? null,
-  });
-
-const dedupeExplorerRecords = (records: DatasetRecord[]) => {
-  const uniqueRecords: DatasetRecord[] = [];
-  const seen = new Set<string>();
-  for (const record of records) {
-    const identity = getExplorerRecordIdentity(record);
-    if (seen.has(identity)) {
-      continue;
-    }
-    seen.add(identity);
-    uniqueRecords.push(record);
-  }
-  return uniqueRecords;
-};
+const parseJsonLikePayload = (value: string): unknown => {
+  const normalized = value
+    .replace(/\uFEFF/g, "")
+    .replace(/,\s*([}\]])/g, "$1")
+    .replace(/(:\s*)(NaN|Infinity|-Infinity)\b/g, "$1null")
+    .replace(/(\[\s*)(NaN|Infinity|-Infinity)\b/g, "$1null")
+    .replace(/(,\s*)(NaN|Infinity|-Infinity)\b/g, "$1null")
+    .replace(/(:\s*)None\b/g, "$1null")
+    .replace(/(:\s*)True\b/g, "$1true")
+    .replace(/(:\s*)False\b/g, "$1false")
+    .replace(/(\[\s*)None\b/g, "$1null")
+    .replace(/(\[\s*)True\b/g, "$1true")
+    .replace(/(\[\s*)False\b/g, "$1false")
+    .replace(/(,\s*)None\b/g, "$1null")
+    .replace(/(,\s*)True\b/g, "$1true")
+    .replace(/(,\s*)False\b/g, "$1false");
+  return JSON.parse(normalized);
+};
+
+const parseRecordStringPayload = (payload: string): DatasetRecord[] | null => {
+  const trimmed = payload.trim();
+  if (!trimmed) {
+    return [];
+  }
+  try {
+    return normalizeRecordPayload(parseJsonLikePayload(trimmed));
+  } catch {
+    // Continue with additional fallback formats below.
+  }
+  const ndjsonLines = trimmed
+    .split(/\r?\n/)
+    .map((line) => line.trim())
+    .filter(Boolean);
+  if (ndjsonLines.length > 0) {
+    try {
+      return ndjsonLines.map((line) => parseJsonLikePayload(line)) as DatasetRecord[];
+    } catch {
+      // Continue with wrapped JSON extraction.
+    }
+  }
+  const bracketStart = trimmed.indexOf("[");
+  const bracketEnd = trimmed.lastIndexOf("]");
+  if (bracketStart !== -1 && bracketEnd > bracketStart) {
+    const candidate = trimmed.slice(bracketStart, bracketEnd + 1);
+    try {
+      return normalizeRecordPayload(parseJsonLikePayload(candidate));
+    } catch {
+      // Continue with object extraction.
+    }
+  }
+  const braceStart = trimmed.indexOf("{");
+  const braceEnd = trimmed.lastIndexOf("}");
+  if (braceStart !== -1 && braceEnd > braceStart) {
+    const candidate = trimmed.slice(braceStart, braceEnd + 1);
+    try {
+      return normalizeRecordPayload(parseJsonLikePayload(candidate));
+    } catch {
+      return null;
+    }
+  }
+  return null;
+};

 const normalizeRecordPayload = (payload: unknown): DatasetRecord[] => {
   if (typeof payload === "string") {
-    try {
-      return normalizeRecordPayload(JSON.parse(payload));
-    } catch {
-      throw new Error("Corpus endpoint returned a non-JSON string payload.");
-    }
+    const parsed = parseRecordStringPayload(payload);
+    if (parsed) {
+      return parsed;
+    }
+    const preview = payload.trim().slice(0, 120).replace(/\s+/g, " ");
+    throw new Error(
+      `Corpus endpoint returned a non-JSON string payload.${
+        preview ? ` Response preview: ${preview}` : ""
+      }`,
+    );
   }

   if (
@@ -265,9 +308,7 @@ const StatPage = () => {
       },
     );

-    const normalizedRecords = dedupeExplorerRecords(
-      normalizeRecordPayload(response.data),
-    );
+    const normalizedRecords = normalizeRecordPayload(response.data);

     setAllRecords(normalizedRecords);
     setAllRecordsKey(filterKey);
@@ -288,9 +329,7 @@ const StatPage = () => {
     try {
       const records = await ensureFilteredRecords();
       const context = buildExplorerContext(records);
-      const matched = dedupeExplorerRecords(
-        records.filter((record) => spec.matcher(record, context)),
-      );
+      const matched = records.filter((record) => spec.matcher(record, context));
       matched.sort((a, b) => {
         const aValue = String(a.dt ?? a.date ?? a.timestamp ?? "");
         const bValue = String(b.dt ?? b.date ?? b.timestamp ?? "");
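
The new parser chain tolerates payloads where Python literals (None, True, NaN, trailing commas) leaked into what should be strict JSON. The same normalization could just as well run server-side before the string leaves the API; a minimal Python sketch of that idea (the helper name normalize_json_like and its regexes are illustrative, mirroring the frontend's parseJsonLikePayload, not code from this change):

import json
import re

def normalize_json_like(raw: str) -> object:
    """Sketch: rewrite Python-style literals into strict JSON, then parse."""
    normalized = raw.replace("\ufeff", "")                   # strip BOMs
    normalized = re.sub(r",\s*([}\]])", r"\1", normalized)   # trailing commas
    # Order matters: handle -Infinity before Infinity.
    for literal, repl in [("-Infinity", "null"), ("Infinity", "null"),
                          ("NaN", "null"), ("None", "null"),
                          ("True", "true"), ("False", "false")]:
        normalized = re.sub(
            rf"([:\[,]\s*){re.escape(literal)}\b", rf"\g<1>{repl}", normalized
        )
    return json.loads(normalized)

print(normalize_json_like('[{"score": NaN, "ok": True, "note": None,}]'))
# [{'score': None, 'ok': True, 'note': None}]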

View File

@@ -3,7 +3,7 @@ const DEFAULT_TITLE = "Ethnograph View";
 const STATIC_TITLES: Record<string, string> = {
   "/login": "Sign In",
   "/upload": "Upload Dataset",
-  "/auto-scrape": "Auto Scrape Dataset",
+  "/auto-fetch": "Auto Fetch Dataset",
   "/datasets": "My Datasets",
 };

View File

@@ -1,17 +1,30 @@
-import pandas as pd
 import re
 from collections import Counter
-from itertools import islice
+from dataclasses import dataclass
+
+import pandas as pd
+
+
+@dataclass(frozen=True)
+class NGramConfig:
+    min_token_length: int = 3
+    min_count: int = 2
+    max_results: int = 100


 class LinguisticAnalysis:
     def __init__(self, word_exclusions: set[str]):
         self.word_exclusions = word_exclusions
+        self.ngram_config = NGramConfig()

-    def _tokenize(self, text: str):
-        tokens = re.findall(r"\b[a-z]{3,}\b", text)
-        return [t for t in tokens if t not in self.word_exclusions]
+    def _tokenize(self, text: str, *, include_exclusions: bool = False) -> list[str]:
+        pattern = rf"\b[a-z]{{{self.ngram_config.min_token_length},}}\b"
+        tokens = re.findall(pattern, text)
+        if include_exclusions:
+            return tokens
+        return [token for token in tokens if token not in self.word_exclusions]

     def _clean_text(self, text: str) -> str:
         text = re.sub(r"http\S+", "", text)  # remove URLs
@@ -21,13 +34,24 @@ class LinguisticAnalysis:
         text = re.sub(r"\S+\.(jpg|jpeg|png|webp|gif)", "", text)
         return text

+    def _content_texts(self, df: pd.DataFrame) -> pd.Series:
+        return df["content"].dropna().astype(str).apply(self._clean_text).str.lower()
+
+    def _valid_ngram(self, tokens: tuple[str, ...]) -> bool:
+        if any(token in self.word_exclusions for token in tokens):
+            return False
+        if len(set(tokens)) == 1:
+            return False
+        return True
+
     def word_frequencies(self, df: pd.DataFrame, limit: int = 100) -> list[dict]:
-        texts = df["content"].dropna().astype(str).str.lower()
+        texts = self._content_texts(df)
         words = []
         for text in texts:
-            tokens = re.findall(r"\b[a-z]{3,}\b", text)
-            words.extend(w for w in tokens if w not in self.word_exclusions)
+            words.extend(self._tokenize(text))
         counts = Counter(words)
@@ -40,25 +64,39 @@ class LinguisticAnalysis:
         return word_frequencies.to_dict(orient="records")

-    def ngrams(self, df: pd.DataFrame, n=2, limit=100):
-        texts = df["content"].dropna().astype(str).apply(self._clean_text).str.lower()
+    def ngrams(self, df: pd.DataFrame, n: int = 2, limit: int | None = None) -> list[dict]:
+        if n < 2:
+            raise ValueError("n must be at least 2")
+        texts = self._content_texts(df)
         all_ngrams = []
+        result_limit = limit or self.ngram_config.max_results
         for text in texts:
-            tokens = re.findall(r"\b[a-z]{3,}\b", text)
-            # stop word removal causes strange behaviors in ngrams
-            # tokens = [w for w in tokens if w not in self.word_exclusions]
-            ngrams = zip(*(islice(tokens, i, None) for i in range(n)))
-            all_ngrams.extend([" ".join(ng) for ng in ngrams])
+            tokens = self._tokenize(text, include_exclusions=True)
+            if len(tokens) < n:
+                continue
+            for index in range(len(tokens) - n + 1):
+                ngram_tokens = tuple(tokens[index : index + n])
+                if self._valid_ngram(ngram_tokens):
+                    all_ngrams.append(" ".join(ngram_tokens))
         counts = Counter(all_ngrams)
+        filtered_counts = [
+            (ngram, count)
+            for ngram, count in counts.items()
+            if count >= self.ngram_config.min_count
+        ]
+        if not filtered_counts:
+            return []
         return (
-            pd.DataFrame(counts.items(), columns=["ngram", "count"])
-            .sort_values("count", ascending=False)
-            .head(limit)
+            pd.DataFrame(filtered_counts, columns=["ngram", "count"])
+            .sort_values(["count", "ngram"], ascending=[False, True])
+            .head(result_limit)
             .to_dict(orient="records")
         )
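
The rewrite swaps the islice/zip sliding window for an explicit loop so each candidate n-gram can be screened before counting: windows touching an excluded word or repeating a single token are dropped, and only n-grams seen at least min_count times survive. A small self-contained check of that window logic (tokens and exclusion set are made up for illustration):

from collections import Counter

tokens = ["rate", "limit", "rate", "limit", "the", "the"]
exclusions = {"the"}
n = 2

ngrams = []
for i in range(len(tokens) - n + 1):
    window = tuple(tokens[i : i + n])
    # Mirrors _valid_ngram: skip windows containing an excluded word
    # or consisting of one repeated token.
    if any(t in exclusions for t in window) or len(set(window)) == 1:
        continue
    ngrams.append(" ".join(window))

print(Counter(ngrams))
# Counter({'rate limit': 2, 'limit rate': 1})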

View File

@@ -89,39 +89,17 @@ class StatGen:
             df.to_json(orient="records", date_format="iso", date_unit="s")
         )

-    def _dedupe_records(self, records: list[dict]) -> list[dict]:
-        unique_records = []
-        seen = set()
-        for record in records:
-            key_data = {
-                "post_id": record.get("post_id"),
-                "parent_id": record.get("parent_id"),
-                "reply_to": record.get("reply_to"),
-                "author": record.get("author"),
-                "type": record.get("type"),
-                "timestamp": record.get("timestamp"),
-                "dt": record.get("dt"),
-                "title": record.get("title"),
-                "content": record.get("content"),
-                "source": record.get("source"),
-                "topic": record.get("topic"),
-            }
-            key = json.dumps(key_data, sort_keys=True, separators=(",", ":"))
-            if key in seen:
-                continue
-            seen.add(key)
-            unique_records.append(record)
-        return unique_records
-
     ## Public Methods
     def filter_dataset(self, df: pd.DataFrame, filters: dict | None = None) -> list[dict]:
         filtered_df = self._prepare_filtered_df(df, filters)
-        return self._dedupe_records(self._json_ready_records(filtered_df))
+        return self._json_ready_records(filtered_df)

-    def temporal(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
+    def temporal(
+        self,
+        df: pd.DataFrame,
+        filters: dict | None = None,
+        dataset_id: int | None = None,
+    ) -> dict:
         filtered_df = self._prepare_filtered_df(df, filters)
         return {
@@ -129,7 +107,12 @@ class StatGen:
             "weekday_hour_heatmap": self.temporal_analysis.heatmap(filtered_df),
         }

-    def linguistic(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
+    def linguistic(
+        self,
+        df: pd.DataFrame,
+        filters: dict | None = None,
+        dataset_id: int | None = None,
+    ) -> dict:
         filtered_df = self._prepare_filtered_df(df, filters)
         return {
@@ -139,7 +122,12 @@ class StatGen:
             "lexical_diversity": self.linguistic_analysis.lexical_diversity(filtered_df)
         }

-    def emotional(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
+    def emotional(
+        self,
+        df: pd.DataFrame,
+        filters: dict | None = None,
+        dataset_id: int | None = None,
+    ) -> dict:
         filtered_df = self._prepare_filtered_df(df, filters)
         return {
@@ -149,7 +137,12 @@ class StatGen:
             "emotion_by_source": self.emotional_analysis.emotion_by_source(filtered_df)
         }

-    def user(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
+    def user(
+        self,
+        df: pd.DataFrame,
+        filters: dict | None = None,
+        dataset_id: int | None = None,
+    ) -> dict:
         filtered_df = self._prepare_filtered_df(df, filters)
         return {
@@ -157,7 +150,12 @@ class StatGen:
             "users": self.user_analysis.per_user_analysis(filtered_df)
         }

-    def interactional(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
+    def interactional(
+        self,
+        df: pd.DataFrame,
+        filters: dict | None = None,
+        dataset_id: int | None = None,
+    ) -> dict:
         filtered_df = self._prepare_filtered_df(df, filters)
         return {
@@ -166,7 +164,12 @@ class StatGen:
             "conversation_concentration": self.interaction_analysis.conversation_concentration(filtered_df)
         }

-    def cultural(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
+    def cultural(
+        self,
+        df: pd.DataFrame,
+        filters: dict | None = None,
+        dataset_id: int | None = None,
+    ) -> dict:
         filtered_df = self._prepare_filtered_df(df, filters)
         return {
@@ -175,7 +178,12 @@ class StatGen:
             "avg_emotion_per_entity": self.cultural_analysis.get_avg_emotions_per_entity(filtered_df)
         }

-    def summary(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
+    def summary(
+        self,
+        df: pd.DataFrame,
+        filters: dict | None = None,
+        dataset_id: int | None = None,
+    ) -> dict:
         filtered_df = self._prepare_filtered_df(df, filters)
         return self.summary_analysis.summary(filtered_df)

View File

@@ -152,9 +152,9 @@ def get_dataset_sources():
     return jsonify(list_metadata)

-@app.route("/datasets/scrape", methods=["POST"])
+@app.route("/datasets/fetch", methods=["POST"])
 @jwt_required()
-def scrape_data():
+def fetch_data():
     data = request.get_json()
     connector_metadata = get_connector_metadata()
@@ -424,7 +424,7 @@ def get_linguistic_analysis(dataset_id):
         dataset_content = dataset_manager.get_dataset_content(dataset_id)
         filters = get_request_filters()
-        return jsonify(stat_gen.linguistic(dataset_content, filters)), 200
+        return jsonify(stat_gen.linguistic(dataset_content, filters, dataset_id=dataset_id)), 200
     except NotAuthorisedException:
         return jsonify({"error": "User is not authorised to access this content"}), 403
     except NonExistentDatasetException:
@@ -448,7 +448,7 @@ def get_emotional_analysis(dataset_id):
         dataset_content = dataset_manager.get_dataset_content(dataset_id)
         filters = get_request_filters()
-        return jsonify(stat_gen.emotional(dataset_content, filters)), 200
+        return jsonify(stat_gen.emotional(dataset_content, filters, dataset_id=dataset_id)), 200
     except NotAuthorisedException:
         return jsonify({"error": "User is not authorised to access this content"}), 403
     except NonExistentDatasetException:
@@ -472,7 +472,7 @@ def get_summary(dataset_id):
         dataset_content = dataset_manager.get_dataset_content(dataset_id)
         filters = get_request_filters()
-        return jsonify(stat_gen.summary(dataset_content, filters)), 200
+        return jsonify(stat_gen.summary(dataset_content, filters, dataset_id=dataset_id)), 200
     except NotAuthorisedException:
         return jsonify({"error": "User is not authorised to access this content"}), 403
     except NonExistentDatasetException:
@@ -496,7 +496,7 @@ def get_temporal_analysis(dataset_id):
         dataset_content = dataset_manager.get_dataset_content(dataset_id)
         filters = get_request_filters()
-        return jsonify(stat_gen.temporal(dataset_content, filters)), 200
+        return jsonify(stat_gen.temporal(dataset_content, filters, dataset_id=dataset_id)), 200
     except NotAuthorisedException:
         return jsonify({"error": "User is not authorised to access this content"}), 403
     except NonExistentDatasetException:
@@ -520,7 +520,7 @@ def get_user_analysis(dataset_id):
         dataset_content = dataset_manager.get_dataset_content(dataset_id)
         filters = get_request_filters()
-        return jsonify(stat_gen.user(dataset_content, filters)), 200
+        return jsonify(stat_gen.user(dataset_content, filters, dataset_id=dataset_id)), 200
     except NotAuthorisedException:
         return jsonify({"error": "User is not authorised to access this content"}), 403
     except NonExistentDatasetException:
@@ -544,7 +544,7 @@ def get_cultural_analysis(dataset_id):
         dataset_content = dataset_manager.get_dataset_content(dataset_id)
         filters = get_request_filters()
-        return jsonify(stat_gen.cultural(dataset_content, filters)), 200
+        return jsonify(stat_gen.cultural(dataset_content, filters, dataset_id=dataset_id)), 200
     except NotAuthorisedException:
         return jsonify({"error": "User is not authorised to access this content"}), 403
     except NonExistentDatasetException:
@@ -568,7 +568,7 @@ def get_interaction_analysis(dataset_id):
         dataset_content = dataset_manager.get_dataset_content(dataset_id)
         filters = get_request_filters()
-        return jsonify(stat_gen.interactional(dataset_content, filters)), 200
+        return jsonify(stat_gen.interactional(dataset_content, filters, dataset_id=dataset_id)), 200
     except NotAuthorisedException:
         return jsonify({"error": "User is not authorised to access this content"}), 403
     except NonExistentDatasetException:
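
On the wire, clients now POST to /datasets/fetch instead of /datasets/scrape, while the analysis routes keep their paths and merely forward dataset_id into StatGen. A hedged client-side sketch using the requests library; the base URL, request body fields, and the stats path are assumptions for illustration, not taken from this diff:

import requests

BASE_URL = "http://localhost:5000"  # assumption: local dev server
token = "..."  # JWT obtained from the login flow

# Queue an auto fetch (previously POST /datasets/scrape).
response = requests.post(
    f"{BASE_URL}/datasets/fetch",
    json={"dataset_name": "example"},  # illustrative body, not the full schema
    headers={"Authorization": f"Bearer {token}"},
    timeout=30,
)
dataset_id = response.json()["dataset_id"]

# Analysis endpoints are unchanged externally; dataset_id is only threaded
# through to StatGen internally.
linguistic = requests.get(
    f"{BASE_URL}/datasets/{dataset_id}/linguistic",  # path is an assumption
    headers={"Authorization": f"Bearer {token}"},
    timeout=30,
).json()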

View File

@@ -11,7 +11,7 @@ from server.connectors.base import BaseConnector

 logger = logging.getLogger(__name__)

-HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; ForumScraper/1.0)"}
+HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; ForumFetcher/1.0)"}

 class BoardsAPI(BaseConnector):

View File

@@ -26,7 +26,34 @@ class DatasetManager:
     def get_dataset_content(self, dataset_id: int) -> pd.DataFrame:
         query = "SELECT * FROM events WHERE dataset_id = %s"
         result = self.db.execute(query, (dataset_id,), fetch=True)
-        return pd.DataFrame(result)
+        df = pd.DataFrame(result)
+        if df.empty:
+            return df
+
+        dedupe_columns = [
+            column
+            for column in [
+                "post_id",
+                "parent_id",
+                "reply_to",
+                "author",
+                "type",
+                "timestamp",
+                "dt",
+                "title",
+                "content",
+                "source",
+                "topic",
+            ]
+            if column in df.columns
+        ]
+        if dedupe_columns:
+            df = df.drop_duplicates(subset=dedupe_columns, keep="first")
+        else:
+            df = df.drop_duplicates(keep="first")
+        return df.reset_index(drop=True)

     def get_dataset_info(self, dataset_id: int) -> dict:
         query = "SELECT * FROM datasets WHERE id = %s"
@@ -52,6 +79,16 @@ class DatasetManager:
         if event_data.empty:
             return

+        dedupe_columns = [
+            column for column in ["id", "type", "source"] if column in event_data.columns
+        ]
+        if dedupe_columns:
+            event_data = event_data.drop_duplicates(subset=dedupe_columns, keep="first")
+        else:
+            event_data = event_data.drop_duplicates(keep="first")
+
+        self.delete_dataset_content(dataset_id)
+
         query = """
             INSERT INTO events (
                 dataset_id,
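
With this change, deduplication happens exactly once, at load time, over the same identity columns the removed StatGen and frontend helpers used. A small self-contained check of the drop_duplicates behavior (rows are made up; only a subset of the identity columns is shown):

import pandas as pd

df = pd.DataFrame(
    [
        {"post_id": 1, "author": "a", "content": "hello"},
        {"post_id": 1, "author": "a", "content": "hello"},        # exact duplicate
        {"post_id": 1, "author": "a", "content": "hello again"},  # kept: content differs
    ]
)

identity_columns = ["post_id", "author", "content"]
dedupe_columns = [c for c in identity_columns if c in df.columns]
deduped = df.drop_duplicates(subset=dedupe_columns, keep="first").reset_index(drop=True)

print(len(deduped))  # 2 -- the exact duplicate row is dropped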