#!/usr/bin/env node /** * Тянет внешние RSS-фиды из src/data/feeds.json и записывает агрегированный * news.json в DATA_DIR (по умолчанию ./data). Запускается по cron на хосте. * * Использование: * node scripts/pull-external-rss.mjs # пишет в ./data/news.json * DATA_DIR=/abs/path node scripts/pull-external-rss.mjs */ import fs from 'node:fs'; import path from 'node:path'; import { fileURLToPath } from 'node:url'; import { XMLParser } from 'fast-xml-parser'; const __dirname = path.dirname(fileURLToPath(import.meta.url)); const ROOT = path.resolve(__dirname, '..'); const FEEDS_FILE = path.join(ROOT, 'src', 'data', 'feeds.json'); const DATA_DIR = process.env.DATA_DIR || path.join(ROOT, 'data'); const OUT_FILE = path.join(DATA_DIR, 'news.json'); const TIMEOUT_MS = 15000; const HARD_CAP = 200; const parser = new XMLParser({ ignoreAttributes: false, attributeNamePrefix: '@_', textNodeName: '#text', }); async function fetchFeed(url, timeoutMs) { const ctl = new AbortController(); const t = setTimeout(() => ctl.abort(), timeoutMs); try { const r = await fetch(url, { signal: ctl.signal, headers: { 'User-Agent': 'pushkinohistory-ru-v2 RSS aggregator' }, }); if (!r.ok) throw new Error(`HTTP ${r.status}`); return await r.text(); } finally { clearTimeout(t); } } function stripHtml(s) { if (!s) return ''; return String(s).replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim().slice(0, 400); } function extractItems(xml, feed) { const parsed = parser.parse(xml); const rssItems = parsed?.rss?.channel?.item; if (rssItems) { const arr = Array.isArray(rssItems) ? rssItems : [rssItems]; return arr.map((it) => ({ title: typeof it.title === 'string' ? it.title : it.title?.['#text'] || '', link: typeof it.link === 'string' ? it.link : it.link?.['#text'] || '', guid: typeof it.guid === 'string' ? it.guid : it.guid?.['#text'] || it.link || '', pubDate: it.pubDate ? new Date(it.pubDate).toISOString() : null, description: stripHtml(it.description || it['content:encoded'] || ''), source: feed.name, })); } const atomEntries = parsed?.feed?.entry; if (atomEntries) { const arr = Array.isArray(atomEntries) ? atomEntries : [atomEntries]; return arr.map((e) => { const link = Array.isArray(e.link) ? e.link[0]?.['@_href'] : e.link?.['@_href'] || e.link; return { title: typeof e.title === 'string' ? e.title : e.title?.['#text'] || '', link: link || '', guid: e.id || link || '', pubDate: e.updated || e.published ? new Date(e.updated || e.published).toISOString() : null, description: stripHtml(e.summary?.['#text'] || e.summary || e.content?.['#text'] || ''), source: feed.name, }; }); } return []; } async function main() { const feeds = JSON.parse(fs.readFileSync(FEEDS_FILE, 'utf8')).filter((f) => f.enabled); if (feeds.length === 0) { console.log('no enabled feeds — writing empty news.json'); fs.mkdirSync(DATA_DIR, { recursive: true }); fs.writeFileSync(OUT_FILE, JSON.stringify({ updatedAt: new Date().toISOString(), items: [] }, null, 2)); return; } const all = []; for (const feed of feeds) { try { const xml = await fetchFeed(feed.url, TIMEOUT_MS); const items = extractItems(xml, feed); const max = feed.max || 20; all.push(...items.slice(0, max)); console.log(`OK ${feed.name}: ${items.length} (kept ${Math.min(items.length, max)})`); } catch (e) { console.warn(`FAIL ${feed.name}: ${e.message}`); } } const seen = new Set(); const deduped = []; for (const it of all) { const key = it.guid || it.link; if (!key || seen.has(key)) continue; seen.add(key); deduped.push(it); } deduped.sort((a, b) => (b.pubDate || '').localeCompare(a.pubDate || '')); const out = { updatedAt: new Date().toISOString(), items: deduped.slice(0, HARD_CAP) }; fs.mkdirSync(DATA_DIR, { recursive: true }); fs.writeFileSync(OUT_FILE, JSON.stringify(out, null, 2)); console.log(`-> ${OUT_FILE}: ${out.items.length} items`); } main().catch((e) => { console.error(e); process.exit(1); });