Compare commits

3 Commits: 6378015726...37d08c63b8

| SHA1 |
|---|
| 37d08c63b8 |
| 1482e96051 |
| cd6030a760 |
@@ -5,7 +5,7 @@ import DatasetsPage from "./pages/Datasets";
 import DatasetStatusPage from "./pages/DatasetStatus";
 import LoginPage from "./pages/Login";
 import UploadPage from "./pages/Upload";
-import AutoScrapePage from "./pages/AutoScrape";
+import AutoFetchPage from "./pages/AutoFetch";
 import StatPage from "./pages/Stats";
 import { getDocumentTitle } from "./utils/documentTitle";
 import DatasetEditPage from "./pages/DatasetEdit";
@@ -23,7 +23,7 @@ function App() {
         <Route path="/" element={<Navigate to="/login" replace />} />
         <Route path="/login" element={<LoginPage />} />
         <Route path="/upload" element={<UploadPage />} />
-        <Route path="/auto-scrape" element={<AutoScrapePage />} />
+        <Route path="/auto-fetch" element={<AutoFetchPage />} />
         <Route path="/datasets" element={<DatasetsPage />} />
         <Route path="/dataset/:datasetId/status" element={<DatasetStatusPage />} />
         <Route path="/dataset/:datasetId/stats" element={<StatPage />} />
@@ -37,7 +37,7 @@ const supportsSearch = (source?: SourceOption): boolean =>
 const supportsCategories = (source?: SourceOption): boolean =>
   Boolean(source?.categories_enabled ?? source?.categoriesEnabled);

-const AutoScrapePage = () => {
+const AutoFetchPage = () => {
   const navigate = useNavigate();
   const [datasetName, setDatasetName] = useState("");
   const [sourceOptions, setSourceOptions] = useState<SourceOption[]>([]);
@@ -106,11 +106,11 @@ const AutoScrapePage = () => {
     );
   };

-  const autoScrape = async () => {
+  const autoFetch = async () => {
     const token = localStorage.getItem("access_token");
     if (!token) {
       setHasError(true);
-      setReturnMessage("You must be signed in to auto scrape a dataset.");
+      setReturnMessage("You must be signed in to auto fetch a dataset.");
       return;
     }

@@ -243,7 +243,7 @@ const AutoScrapePage = () => {
       setReturnMessage("");

       const response = await axios.post(
-        `${API_BASE_URL}/datasets/scrape`,
+        `${API_BASE_URL}/datasets/fetch`,
         requestBody,
         {
           headers: {
@@ -255,7 +255,7 @@ const AutoScrapePage = () => {
       const datasetId = Number(response.data.dataset_id);

       setReturnMessage(
-        `Auto scrape queued successfully (dataset #${datasetId}). Redirecting to processing status...`,
+        `Auto fetch queued successfully (dataset #${datasetId}). Redirecting to processing status...`,
       );

       setTimeout(() => {
@@ -267,11 +267,11 @@ const AutoScrapePage = () => {
         const message = String(
           requestError.response?.data?.error ||
             requestError.message ||
-            "Auto scrape failed.",
+            "Auto fetch failed.",
         );
-        setReturnMessage(`Auto scrape failed: ${message}`);
+        setReturnMessage(`Auto fetch failed: ${message}`);
       } else {
-        setReturnMessage("Auto scrape failed due to an unexpected error.");
+        setReturnMessage("Auto fetch failed due to an unexpected error.");
       }
     } finally {
       setIsSubmitting(false);
@@ -283,9 +283,9 @@ const AutoScrapePage = () => {
     <div style={styles.containerWide}>
       <div style={{ ...styles.card, ...styles.headerBar }}>
         <div>
-          <h1 style={styles.sectionHeaderTitle}>Auto Scrape Dataset</h1>
+          <h1 style={styles.sectionHeaderTitle}>Auto Fetch Dataset</h1>
           <p style={styles.sectionHeaderSubtitle}>
-            Select sources and scrape settings, then queue processing
+            Select sources and fetch settings, then queue processing
             automatically.
           </p>
           <p
@@ -295,7 +295,7 @@ const AutoScrapePage = () => {
              color: "#9a6700",
            }}
          >
-            Warning: Scraping more than 250 posts from any single site can
+            Warning: Fetching more than 250 posts from any single site can
            take hours due to rate limits.
          </p>
        </div>
@@ -305,10 +305,10 @@ const AutoScrapePage = () => {
            ...styles.buttonPrimary,
            opacity: isSubmitting || isLoadingSources ? 0.75 : 1,
          }}
-          onClick={autoScrape}
+          onClick={autoFetch}
          disabled={isSubmitting || isLoadingSources}
        >
-          {isSubmitting ? "Queueing..." : "Auto Scrape and Analyze"}
+          {isSubmitting ? "Queueing..." : "Auto Fetch and Analyze"}
        </button>
      </div>

@@ -527,4 +527,4 @@ const AutoScrapePage = () => {
   );
 };

-export default AutoScrapePage;
+export default AutoFetchPage;
@@ -108,9 +108,9 @@ const DatasetsPage = () => {
          <button
            type="button"
            style={styles.buttonSecondary}
-            onClick={() => navigate("/auto-scrape")}
+            onClick={() => navigate("/auto-fetch")}
          >
-            Auto Scrape Dataset
+            Auto Fetch Dataset
          </button>
        </div>
      </div>
@@ -66,45 +66,88 @@ const EMPTY_EXPLORER_STATE: ExplorerState = {
   error: "",
 };

-const getExplorerRecordIdentity = (record: DatasetRecord) =>
-  JSON.stringify({
-    post_id: record.post_id ?? null,
-    parent_id: record.parent_id ?? null,
-    reply_to: record.reply_to ?? null,
-    author: record.author ?? null,
-    type: record.type ?? null,
-    timestamp: record.timestamp ?? null,
-    dt: record.dt ?? null,
-    title: record.title ?? null,
-    content: record.content ?? null,
-    source: record.source ?? null,
-    topic: record.topic ?? null,
-  });
+const parseJsonLikePayload = (value: string): unknown => {
+  const normalized = value
+    .replace(/\uFEFF/g, "")
+    .replace(/,\s*([}\]])/g, "$1")
+    .replace(/(:\s*)(NaN|Infinity|-Infinity)\b/g, "$1null")
+    .replace(/(\[\s*)(NaN|Infinity|-Infinity)\b/g, "$1null")
+    .replace(/(,\s*)(NaN|Infinity|-Infinity)\b/g, "$1null")
+    .replace(/(:\s*)None\b/g, "$1null")
+    .replace(/(:\s*)True\b/g, "$1true")
+    .replace(/(:\s*)False\b/g, "$1false")
+    .replace(/(\[\s*)None\b/g, "$1null")
+    .replace(/(\[\s*)True\b/g, "$1true")
+    .replace(/(\[\s*)False\b/g, "$1false")
+    .replace(/(,\s*)None\b/g, "$1null")
+    .replace(/(,\s*)True\b/g, "$1true")
+    .replace(/(,\s*)False\b/g, "$1false");

-const dedupeExplorerRecords = (records: DatasetRecord[]) => {
-  const uniqueRecords: DatasetRecord[] = [];
-  const seen = new Set<string>();
+  return JSON.parse(normalized);
+};

-  for (const record of records) {
-    const identity = getExplorerRecordIdentity(record);
-    if (seen.has(identity)) {
-      continue;
+const parseRecordStringPayload = (payload: string): DatasetRecord[] | null => {
+  const trimmed = payload.trim();
+  if (!trimmed) {
+    return [];
   }

-    seen.add(identity);
-    uniqueRecords.push(record);
+  try {
+    return normalizeRecordPayload(parseJsonLikePayload(trimmed));
+  } catch {
+    // Continue with additional fallback formats below.
   }

-  return uniqueRecords;
+  const ndjsonLines = trimmed
+    .split(/\r?\n/)
+    .map((line) => line.trim())
+    .filter(Boolean);
+  if (ndjsonLines.length > 0) {
+    try {
+      return ndjsonLines.map((line) => parseJsonLikePayload(line)) as DatasetRecord[];
+    } catch {
+      // Continue with wrapped JSON extraction.
+    }
+  }
+
+  const bracketStart = trimmed.indexOf("[");
+  const bracketEnd = trimmed.lastIndexOf("]");
+  if (bracketStart !== -1 && bracketEnd > bracketStart) {
+    const candidate = trimmed.slice(bracketStart, bracketEnd + 1);
+    try {
+      return normalizeRecordPayload(parseJsonLikePayload(candidate));
+    } catch {
+      // Continue with object extraction.
+    }
+  }
+
+  const braceStart = trimmed.indexOf("{");
+  const braceEnd = trimmed.lastIndexOf("}");
+  if (braceStart !== -1 && braceEnd > braceStart) {
+    const candidate = trimmed.slice(braceStart, braceEnd + 1);
+    try {
+      return normalizeRecordPayload(parseJsonLikePayload(candidate));
+    } catch {
+      return null;
+    }
+  }
+
+  return null;
 };

 const normalizeRecordPayload = (payload: unknown): DatasetRecord[] => {
   if (typeof payload === "string") {
-    try {
-      return normalizeRecordPayload(JSON.parse(payload));
-    } catch {
-      throw new Error("Corpus endpoint returned a non-JSON string payload.");
+    const parsed = parseRecordStringPayload(payload);
+    if (parsed) {
+      return parsed;
     }
+
+    const preview = payload.trim().slice(0, 120).replace(/\s+/g, " ");
+    throw new Error(
+      `Corpus endpoint returned a non-JSON string payload.${
+        preview ? ` Response preview: ${preview}` : ""
+      }`,
+    );
   }

   if (
@@ -265,9 +308,7 @@ const StatPage = () => {
        },
      );

-      const normalizedRecords = dedupeExplorerRecords(
-        normalizeRecordPayload(response.data),
-      );
+      const normalizedRecords = normalizeRecordPayload(response.data);

      setAllRecords(normalizedRecords);
      setAllRecordsKey(filterKey);
@@ -288,9 +329,7 @@ const StatPage = () => {
    try {
      const records = await ensureFilteredRecords();
      const context = buildExplorerContext(records);
-      const matched = dedupeExplorerRecords(
-        records.filter((record) => spec.matcher(record, context)),
-      );
+      const matched = records.filter((record) => spec.matcher(record, context));
      matched.sort((a, b) => {
        const aValue = String(a.dt ?? a.date ?? a.timestamp ?? "");
        const bValue = String(b.dt ?? b.date ?? b.timestamp ?? "");
@@ -3,7 +3,7 @@ const DEFAULT_TITLE = "Ethnograph View";
 const STATIC_TITLES: Record<string, string> = {
   "/login": "Sign In",
   "/upload": "Upload Dataset",
-  "/auto-scrape": "Auto Scrape Dataset",
+  "/auto-fetch": "Auto Fetch Dataset",
   "/datasets": "My Datasets",
 };

@@ -1,17 +1,30 @@
-import pandas as pd
 import re

 from collections import Counter
-from itertools import islice
+from dataclasses import dataclass

+import pandas as pd
+
+
+@dataclass(frozen=True)
+class NGramConfig:
+    min_token_length: int = 3
+    min_count: int = 2
+    max_results: int = 100
+

 class LinguisticAnalysis:
     def __init__(self, word_exclusions: set[str]):
         self.word_exclusions = word_exclusions
+        self.ngram_config = NGramConfig()

-    def _tokenize(self, text: str):
-        tokens = re.findall(r"\b[a-z]{3,}\b", text)
-        return [t for t in tokens if t not in self.word_exclusions]
+    def _tokenize(self, text: str, *, include_exclusions: bool = False) -> list[str]:
+        pattern = rf"\b[a-z]{{{self.ngram_config.min_token_length},}}\b"
+        tokens = re.findall(pattern, text)
+
+        if include_exclusions:
+            return tokens
+
+        return [token for token in tokens if token not in self.word_exclusions]

     def _clean_text(self, text: str) -> str:
         text = re.sub(r"http\S+", "", text)  # remove URLs
@@ -21,13 +34,24 @@ class LinguisticAnalysis:
         text = re.sub(r"\S+\.(jpg|jpeg|png|webp|gif)", "", text)
         return text

+    def _content_texts(self, df: pd.DataFrame) -> pd.Series:
+        return df["content"].dropna().astype(str).apply(self._clean_text).str.lower()
+
+    def _valid_ngram(self, tokens: tuple[str, ...]) -> bool:
+        if any(token in self.word_exclusions for token in tokens):
+            return False
+
+        if len(set(tokens)) == 1:
+            return False
+
+        return True
+
     def word_frequencies(self, df: pd.DataFrame, limit: int = 100) -> list[dict]:
-        texts = df["content"].dropna().astype(str).str.lower()
+        texts = self._content_texts(df)

         words = []
         for text in texts:
-            tokens = re.findall(r"\b[a-z]{3,}\b", text)
-            words.extend(w for w in tokens if w not in self.word_exclusions)
+            words.extend(self._tokenize(text))

         counts = Counter(words)

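Editor's note, not part of the diff: a minimal standalone sketch of the validity rule the new `_valid_ngram` helper encodes. The exclusion set and token tuples below are made-up examples, not values from the repository.

```python
# Illustrative only: mirrors the rule used by _valid_ngram in the hunk above.
word_exclusions = {"the", "and"}  # hypothetical exclusion set


def valid_ngram(tokens: tuple[str, ...]) -> bool:
    if any(token in word_exclusions for token in tokens):
        return False  # reject n-grams that touch an excluded word
    if len(set(tokens)) == 1:
        return False  # reject degenerate repeats like ("spam", "spam")
    return True


print(valid_ngram(("climate", "policy")))  # True
print(valid_ngram(("the", "policy")))      # False: contains an excluded word
print(valid_ngram(("spam", "spam")))       # False: single repeated token
```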
@@ -40,25 +64,39 @@ class LinguisticAnalysis:

         return word_frequencies.to_dict(orient="records")

-    def ngrams(self, df: pd.DataFrame, n=2, limit=100):
-        texts = df["content"].dropna().astype(str).apply(self._clean_text).str.lower()
+    def ngrams(self, df: pd.DataFrame, n: int = 2, limit: int | None = None) -> list[dict]:
+        if n < 2:
+            raise ValueError("n must be at least 2")

+        texts = self._content_texts(df)
         all_ngrams = []
+        result_limit = limit or self.ngram_config.max_results

         for text in texts:
-            tokens = re.findall(r"\b[a-z]{3,}\b", text)
+            tokens = self._tokenize(text, include_exclusions=True)

-            # stop word removal causes strange behaviors in ngrams
-            # tokens = [w for w in tokens if w not in self.word_exclusions]
+            if len(tokens) < n:
+                continue

-            ngrams = zip(*(islice(tokens, i, None) for i in range(n)))
-            all_ngrams.extend([" ".join(ng) for ng in ngrams])
+            for index in range(len(tokens) - n + 1):
+                ngram_tokens = tuple(tokens[index : index + n])
+                if self._valid_ngram(ngram_tokens):
+                    all_ngrams.append(" ".join(ngram_tokens))

         counts = Counter(all_ngrams)
+        filtered_counts = [
+            (ngram, count)
+            for ngram, count in counts.items()
+            if count >= self.ngram_config.min_count
+        ]
+
+        if not filtered_counts:
+            return []
+
         return (
-            pd.DataFrame(counts.items(), columns=["ngram", "count"])
-            .sort_values("count", ascending=False)
-            .head(limit)
+            pd.DataFrame(filtered_counts, columns=["ngram", "count"])
+            .sort_values(["count", "ngram"], ascending=[False, True])
+            .head(result_limit)
             .to_dict(orient="records")
         )

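Editor's note, not part of the diff: a minimal sketch of the sliding-window counting, min-count filtering, and deterministic sort that the rewritten `ngrams` method performs. The token lists are toy data, the `_valid_ngram` exclusion check is omitted for brevity, and the config values simply mirror the `NGramConfig` defaults shown above.

```python
from collections import Counter

import pandas as pd

# Toy, pre-tokenized texts; the real method derives tokens from a dataset DataFrame.
tokens_per_text = [
    ["solar", "power", "grid"],
    ["solar", "power", "storage"],
    ["wind", "power"],
]

n = 2
min_count = 2      # NGramConfig.min_count default
max_results = 100  # NGramConfig.max_results default

all_ngrams = []
for tokens in tokens_per_text:
    if len(tokens) < n:
        continue
    for index in range(len(tokens) - n + 1):
        all_ngrams.append(" ".join(tokens[index : index + n]))

counts = Counter(all_ngrams)
filtered = [(ngram, count) for ngram, count in counts.items() if count >= min_count]

result = (
    pd.DataFrame(filtered, columns=["ngram", "count"])
    .sort_values(["count", "ngram"], ascending=[False, True])  # tie-break by ngram text
    .head(max_results)
    .to_dict(orient="records")
)
print(result)  # [{'ngram': 'solar power', 'count': 2}]
```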
@@ -89,39 +89,17 @@ class StatGen:
             df.to_json(orient="records", date_format="iso", date_unit="s")
         )

-    def _dedupe_records(self, records: list[dict]) -> list[dict]:
-        unique_records = []
-        seen = set()
-
-        for record in records:
-            key_data = {
-                "post_id": record.get("post_id"),
-                "parent_id": record.get("parent_id"),
-                "reply_to": record.get("reply_to"),
-                "author": record.get("author"),
-                "type": record.get("type"),
-                "timestamp": record.get("timestamp"),
-                "dt": record.get("dt"),
-                "title": record.get("title"),
-                "content": record.get("content"),
-                "source": record.get("source"),
-                "topic": record.get("topic"),
-            }
-            key = json.dumps(key_data, sort_keys=True, separators=(",", ":"))
-            if key in seen:
-                continue
-
-            seen.add(key)
-            unique_records.append(record)
-
-        return unique_records
-
     ## Public Methods
     def filter_dataset(self, df: pd.DataFrame, filters: dict | None = None) -> list[dict]:
         filtered_df = self._prepare_filtered_df(df, filters)
-        return self._dedupe_records(self._json_ready_records(filtered_df))
+        return self._json_ready_records(filtered_df)

-    def temporal(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
+    def temporal(
+        self,
+        df: pd.DataFrame,
+        filters: dict | None = None,
+        dataset_id: int | None = None,
+    ) -> dict:
         filtered_df = self._prepare_filtered_df(df, filters)

         return {
@@ -129,7 +107,12 @@ class StatGen:
             "weekday_hour_heatmap": self.temporal_analysis.heatmap(filtered_df),
         }

-    def linguistic(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
+    def linguistic(
+        self,
+        df: pd.DataFrame,
+        filters: dict | None = None,
+        dataset_id: int | None = None,
+    ) -> dict:
         filtered_df = self._prepare_filtered_df(df, filters)

         return {
@@ -139,7 +122,12 @@ class StatGen:
             "lexical_diversity": self.linguistic_analysis.lexical_diversity(filtered_df)
         }

-    def emotional(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
+    def emotional(
+        self,
+        df: pd.DataFrame,
+        filters: dict | None = None,
+        dataset_id: int | None = None,
+    ) -> dict:
         filtered_df = self._prepare_filtered_df(df, filters)

         return {
@@ -149,7 +137,12 @@ class StatGen:
             "emotion_by_source": self.emotional_analysis.emotion_by_source(filtered_df)
         }

-    def user(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
+    def user(
+        self,
+        df: pd.DataFrame,
+        filters: dict | None = None,
+        dataset_id: int | None = None,
+    ) -> dict:
         filtered_df = self._prepare_filtered_df(df, filters)

         return {
@@ -157,7 +150,12 @@ class StatGen:
             "users": self.user_analysis.per_user_analysis(filtered_df)
         }

-    def interactional(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
+    def interactional(
+        self,
+        df: pd.DataFrame,
+        filters: dict | None = None,
+        dataset_id: int | None = None,
+    ) -> dict:
         filtered_df = self._prepare_filtered_df(df, filters)

         return {
@@ -166,7 +164,12 @@ class StatGen:
             "conversation_concentration": self.interaction_analysis.conversation_concentration(filtered_df)
         }

-    def cultural(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
+    def cultural(
+        self,
+        df: pd.DataFrame,
+        filters: dict | None = None,
+        dataset_id: int | None = None,
+    ) -> dict:
         filtered_df = self._prepare_filtered_df(df, filters)

         return {
@@ -175,7 +178,12 @@ class StatGen:
             "avg_emotion_per_entity": self.cultural_analysis.get_avg_emotions_per_entity(filtered_df)
         }

-    def summary(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
+    def summary(
+        self,
+        df: pd.DataFrame,
+        filters: dict | None = None,
+        dataset_id: int | None = None,
+    ) -> dict:
         filtered_df = self._prepare_filtered_df(df, filters)

         return self.summary_analysis.summary(filtered_df)
@@ -152,9 +152,9 @@ def get_dataset_sources():
     return jsonify(list_metadata)


-@app.route("/datasets/scrape", methods=["POST"])
+@app.route("/datasets/fetch", methods=["POST"])
 @jwt_required()
-def scrape_data():
+def fetch_data():
     data = request.get_json()
     connector_metadata = get_connector_metadata()

@@ -424,7 +424,7 @@ def get_linguistic_analysis(dataset_id):

         dataset_content = dataset_manager.get_dataset_content(dataset_id)
         filters = get_request_filters()
-        return jsonify(stat_gen.linguistic(dataset_content, filters)), 200
+        return jsonify(stat_gen.linguistic(dataset_content, filters, dataset_id=dataset_id)), 200
     except NotAuthorisedException:
         return jsonify({"error": "User is not authorised to access this content"}), 403
     except NonExistentDatasetException:
@@ -448,7 +448,7 @@ def get_emotional_analysis(dataset_id):

         dataset_content = dataset_manager.get_dataset_content(dataset_id)
         filters = get_request_filters()
-        return jsonify(stat_gen.emotional(dataset_content, filters)), 200
+        return jsonify(stat_gen.emotional(dataset_content, filters, dataset_id=dataset_id)), 200
     except NotAuthorisedException:
         return jsonify({"error": "User is not authorised to access this content"}), 403
     except NonExistentDatasetException:
@@ -472,7 +472,7 @@ def get_summary(dataset_id):

         dataset_content = dataset_manager.get_dataset_content(dataset_id)
         filters = get_request_filters()
-        return jsonify(stat_gen.summary(dataset_content, filters)), 200
+        return jsonify(stat_gen.summary(dataset_content, filters, dataset_id=dataset_id)), 200
     except NotAuthorisedException:
         return jsonify({"error": "User is not authorised to access this content"}), 403
     except NonExistentDatasetException:
@@ -496,7 +496,7 @@ def get_temporal_analysis(dataset_id):

         dataset_content = dataset_manager.get_dataset_content(dataset_id)
         filters = get_request_filters()
-        return jsonify(stat_gen.temporal(dataset_content, filters)), 200
+        return jsonify(stat_gen.temporal(dataset_content, filters, dataset_id=dataset_id)), 200
     except NotAuthorisedException:
         return jsonify({"error": "User is not authorised to access this content"}), 403
     except NonExistentDatasetException:
@@ -520,7 +520,7 @@ def get_user_analysis(dataset_id):

         dataset_content = dataset_manager.get_dataset_content(dataset_id)
         filters = get_request_filters()
-        return jsonify(stat_gen.user(dataset_content, filters)), 200
+        return jsonify(stat_gen.user(dataset_content, filters, dataset_id=dataset_id)), 200
     except NotAuthorisedException:
         return jsonify({"error": "User is not authorised to access this content"}), 403
     except NonExistentDatasetException:
@@ -544,7 +544,7 @@ def get_cultural_analysis(dataset_id):

         dataset_content = dataset_manager.get_dataset_content(dataset_id)
         filters = get_request_filters()
-        return jsonify(stat_gen.cultural(dataset_content, filters)), 200
+        return jsonify(stat_gen.cultural(dataset_content, filters, dataset_id=dataset_id)), 200
     except NotAuthorisedException:
         return jsonify({"error": "User is not authorised to access this content"}), 403
     except NonExistentDatasetException:
@@ -568,7 +568,7 @@ def get_interaction_analysis(dataset_id):

         dataset_content = dataset_manager.get_dataset_content(dataset_id)
         filters = get_request_filters()
-        return jsonify(stat_gen.interactional(dataset_content, filters)), 200
+        return jsonify(stat_gen.interactional(dataset_content, filters, dataset_id=dataset_id)), 200
     except NotAuthorisedException:
         return jsonify({"error": "User is not authorised to access this content"}), 403
     except NonExistentDatasetException:
@@ -11,7 +11,7 @@ from server.connectors.base import BaseConnector

 logger = logging.getLogger(__name__)

-HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; ForumScraper/1.0)"}
+HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; ForumFetcher/1.0)"}


 class BoardsAPI(BaseConnector):
@@ -26,7 +26,34 @@ class DatasetManager:
     def get_dataset_content(self, dataset_id: int) -> pd.DataFrame:
         query = "SELECT * FROM events WHERE dataset_id = %s"
         result = self.db.execute(query, (dataset_id,), fetch=True)
-        return pd.DataFrame(result)
+        df = pd.DataFrame(result)
+        if df.empty:
+            return df
+
+        dedupe_columns = [
+            column
+            for column in [
+                "post_id",
+                "parent_id",
+                "reply_to",
+                "author",
+                "type",
+                "timestamp",
+                "dt",
+                "title",
+                "content",
+                "source",
+                "topic",
+            ]
+            if column in df.columns
+        ]
+
+        if dedupe_columns:
+            df = df.drop_duplicates(subset=dedupe_columns, keep="first")
+        else:
+            df = df.drop_duplicates(keep="first")
+
+        return df.reset_index(drop=True)

     def get_dataset_info(self, dataset_id: int) -> dict:
         query = "SELECT * FROM datasets WHERE id = %s"
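Editor's note, not part of the diff: the dedupe added to `get_dataset_content` leans on pandas `drop_duplicates` over whichever identity columns are actually present in the frame. A toy sketch of that behaviour, using a shortened, hypothetical column list:

```python
import pandas as pd

# Toy events with one exact duplicate on the identity columns.
df = pd.DataFrame(
    [
        {"post_id": 1, "author": "ana", "content": "hello"},
        {"post_id": 1, "author": "ana", "content": "hello"},  # duplicate row
        {"post_id": 2, "author": "ben", "content": "hi"},
    ]
)

identity_columns = ["post_id", "parent_id", "author", "content"]  # shortened example list
dedupe_columns = [column for column in identity_columns if column in df.columns]

# "parent_id" is missing from the toy frame, so it is skipped rather than raising.
deduped = df.drop_duplicates(subset=dedupe_columns, keep="first").reset_index(drop=True)
print(len(deduped))  # 2
```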
@@ -52,6 +79,16 @@ class DatasetManager:
         if event_data.empty:
             return

+        dedupe_columns = [
+            column for column in ["id", "type", "source"] if column in event_data.columns
+        ]
+        if dedupe_columns:
+            event_data = event_data.drop_duplicates(subset=dedupe_columns, keep="first")
+        else:
+            event_data = event_data.drop_duplicates(keep="first")
+
+        self.delete_dataset_content(dataset_id)
+
         query = """
             INSERT INTO events (
                 dataset_id,