#!/usr/bin/env python3
"""Convert WP posts-raw.jsonl → src/content/{posts,pages}.json with image URL rewrite + translit slugs.

Every non-blank line of the raw export is parsed as one JSON object. The
fields read from it are ``id``, ``name`` (the old slug), ``title``, ``date``,
``content``, ``type`` and, optionally, ``excerpt``. Rows whose ``type`` is
``'post'`` go to ``posts.json``; all other rows go to ``pages.json``.
"""
import json
import re
import sys
from pathlib import Path
from urllib.parse import unquote

ROOT = Path(__file__).resolve().parent.parent
RAW = ROOT / "scripts" / "posts-raw.jsonl"
OUT_DIR = ROOT / "src" / "content"

# Lowercase Cyrillic letter -> Latin replacement. Hard and soft signs map to
# the empty string, i.e. they are dropped from the slug.
TRANSLIT = {
    'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd', 'е': 'e', 'ё': 'yo',
    'ж': 'zh', 'з': 'z', 'и': 'i', 'й': 'y', 'к': 'k', 'л': 'l', 'м': 'm',
    'н': 'n', 'о': 'o', 'п': 'p', 'р': 'r', 'с': 's', 'т': 't', 'у': 'u',
    'ф': 'f', 'х': 'h', 'ц': 'ts', 'ч': 'ch', 'ш': 'sh', 'щ': 'sch',
    'ъ': '', 'ы': 'y', 'ь': '', 'э': 'e', 'ю': 'yu', 'я': 'ya',
}


def slugify_ru(s: str) -> str:
    """Return a transliterated, hyphen-separated slug for *s*.

    The input is percent-decoded and lowercased first. Characters found in
    ``TRANSLIT`` are replaced by their Latin mapping; any other alphanumeric
    character, ``-`` and ``_`` are kept as is; spaces and tabs become ``-``;
    everything else is dropped. Runs of hyphens are collapsed and leading or
    trailing hyphens removed. ``'untitled'`` is returned when nothing is left.
    """
    s = unquote(s).lower()
    out = []
    for ch in s:
        if ch in TRANSLIT:
            out.append(TRANSLIT[ch])
        elif ch.isalnum() or ch in '-_':
            # NOTE: isalnum() also accepts non-ASCII letters/digits that are
            # not in TRANSLIT; those pass through untouched.
            out.append(ch)
        elif ch in ' \t':
            out.append('-')
    res = ''.join(out)
    res = re.sub(r'-+', '-', res).strip('-')
    return res or 'untitled'


# Absolute upload URL on the old site. The two path segments that follow
# "uploads/" are matched but not captured; only the remainder is kept.
UPLOAD_RE = re.compile(r'https?://(?:www\.)?pushkinohistory\.ru/wp-content/uploads/[^/]+/[^/]+/([^"\'\s)]+)')


def rewrite_uploads(html: str) -> str:
    """Rewrite absolute upload URLs in *html* to site-relative ``/uploads/<rest>``.

    The two directory segments after ``uploads/`` are dropped, so files end
    up in one flat ``/uploads/`` namespace.
    """
    return UPLOAD_RE.sub(r'/uploads/\1', html)


# Row id -> category slugs. Ids missing from this table get no categories.
CATEGORIES = {
    20: [], 23: [], 73: [], 94: [],  # pages — no category
    137: ['main'],
    139: ['main'],
    142: ['main'],
    145: ['main', 'today', 'tech'],
    158: ['main', 'tech'],
    226: ['main'],
    235: ['main'],
}

# Category slug -> display name written to the output.
CATEGORY_NAMES = {
    'main': 'Главная',
    'today': 'Настоящее',
    'tech': 'Техническое',
}


def _build_item(row: dict) -> dict:
    """Build the output record for one parsed row of the raw export."""
    old_slug = row['name']
    # slugify_ru() percent-decodes on its own, so encoded and plain slugs
    # take the same path.
    new_slug = slugify_ru(old_slug)
    html = rewrite_uploads(row['content'])
    category_slugs = CATEGORIES.get(row['id'], [])
    return {
        'id': row['id'],
        'slug': new_slug,
        'oldSlug': old_slug,
        'title': row['title'],
        'date': row['date'],
        'excerpt': row.get('excerpt') or '',
        'html': html,
        'categories': [CATEGORY_NAMES[c] for c in category_slugs],
        'categorySlugs': category_slugs,
    }


def _warn_slug_collisions(items: list[dict]) -> None:
    """Print a stderr warning for every slug shared by more than one item.

    Transliteration is lossy (several Cyrillic letters map to the same Latin
    text and the signs are dropped), so distinct old slugs can collide. The
    report lists posts and pages alike as ``/<slug>/`` paths, so a shared
    slug is presumably a routing conflict — this only warns, it changes
    nothing in the output.
    """
    first_seen: dict[str, dict] = {}
    for item in items:
        first = first_seen.setdefault(item['slug'], item)
        if first is not item:
            print(
                f"warning: slug /{item['slug']}/ is used by both "
                f"id {first['id']} and id {item['id']}",
                file=sys.stderr,
            )


def _write_json(path: Path, items: list[dict]) -> None:
    """Write *items* to *path* as indented UTF-8 JSON with non-ASCII kept."""
    path.write_text(
        json.dumps(items, ensure_ascii=False, indent=2), encoding='utf-8'
    )


def main() -> None:
    """Read the raw export, write posts.json and pages.json, print a report.

    Posts are sorted by ``date``, newest first; pages are sorted by ``id``.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a row lacks a required field.
    """
    posts: list[dict] = []
    pages: list[dict] = []
    with RAW.open(encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            row = json.loads(line)
            item = _build_item(row)
            if row['type'] == 'post':
                posts.append(item)
            else:
                pages.append(item)

    posts.sort(key=lambda p: p['date'], reverse=True)
    pages.sort(key=lambda p: p['id'])
    _warn_slug_collisions(posts + pages)

    # Created here rather than at import time so that importing this module
    # (e.g. to reuse slugify_ru) does not touch the filesystem.
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    _write_json(OUT_DIR / 'posts.json', posts)
    _write_json(OUT_DIR / 'pages.json', pages)

    print(f'posts: {len(posts)} → src/content/posts.json')
    print(f'pages: {len(pages)} → src/content/pages.json')
    for p in posts:
        print(f" post: /{p['slug']}/ (was: {p['oldSlug'][:40]}{'...' if len(p['oldSlug']) > 40 else ''}) — {p['title']}")
    for p in pages:
        print(f" page: /{p['slug']}/ — {p['title']}")


if __name__ == '__main__':
    main()