# pushkinohistory-ru-v2/scripts/convert_posts.py

#!/usr/bin/env python3
"""Convert WP posts-raw.jsonl → src/content/{posts,pages}.json with image URL rewrite + translit slugs."""
import json
import re
import sys
from pathlib import Path
from urllib.parse import unquote

# Parent of the directory that contains this script file.
ROOT = Path(__file__).resolve().parent.parent
# Input file read by main(): one JSON object per non-empty line.
RAW = ROOT / "scripts" / "posts-raw.jsonl"
# Directory that receives the generated posts.json / pages.json.
OUT_DIR = ROOT / "src" / "content"
# NOTE: executed at import time, so importing this module already creates the directory.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Lowercase Cyrillic letter -> Latin replacement. The hard and soft signs map
# to the empty string, i.e. they are removed from the slug.
TRANSLIT = {
    'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd',
    'е': 'e', 'ё': 'yo', 'ж': 'zh', 'з': 'z', 'и': 'i',
    'й': 'y', 'к': 'k', 'л': 'l', 'м': 'm', 'н': 'n',
    'о': 'o', 'п': 'p', 'р': 'r', 'с': 's', 'т': 't',
    'у': 'u', 'ф': 'f', 'х': 'h', 'ц': 'ts', 'ч': 'ch',
    'ш': 'sh', 'щ': 'sch', 'ъ': '', 'ы': 'y', 'ь': '',
    'э': 'e', 'ю': 'yu', 'я': 'ya',
}


def slugify_ru(s: str) -> str:
    """Build a transliterated, hyphen-separated slug from *s*.

    The input is percent-decoded and lowercased first. Characters found in
    TRANSLIT are replaced by their Latin mapping, spaces and tabs become
    hyphens, other alphanumerics plus '-' and '_' are kept unchanged, and
    every remaining character is dropped. Runs of hyphens collapse to one
    and leading/trailing hyphens are removed.

    Returns 'untitled' when nothing is left.
    """
    pieces = []
    for char in unquote(s).lower():
        latin = TRANSLIT.get(char)
        if latin is not None:
            # May be '' (hard/soft sign): the letter simply disappears.
            pieces.append(latin)
        elif char in ' \t':
            pieces.append('-')
        elif char in '-_' or char.isalnum():
            pieces.append(char)
    # Splitting on '-' and discarding empty segments both collapses hyphen
    # runs and trims hyphens at either end.
    segments = [seg for seg in ''.join(pieces).split('-') if seg]
    return '-'.join(segments) or 'untitled'

# Absolute link (http/https, optional "www.") into the old site's
# wp-content/uploads tree. The two path segments right after "uploads/" are
# skipped; group 1 is everything after them up to a quote, whitespace or ")".
UPLOAD_RE = re.compile(r'https?://(?:www\.)?pushkinohistory\.ru/wp-content/uploads/[^/]+/[^/]+/([^"\'\s)]+)')
# WP-resized variants: file-1024x768.png → file.png. Only the original file is kept in /uploads/.
RESIZED_RE = re.compile(r'(/uploads/[^"\'\s)]+?)-\d+x\d+(\.\w+)')


def rewrite_uploads(html: str) -> str:
    """Return *html* with old-site upload URLs pointed at local /uploads/ paths.

    First every URL matched by UPLOAD_RE is replaced by "/uploads/" followed
    by its captured tail. Then any "/uploads/..." path carrying a
    "-<digits>x<digits>" suffix before its extension has that suffix removed.
    """
    localized = UPLOAD_RE.sub(r'/uploads/\1', html)
    return RESIZED_RE.sub(r'\1\2', localized)

# Category slugs per source row id (looked up with row['id'] in main()).
# Ids missing from this table are treated by main() as having no categories.
CATEGORIES = {
    20: [], 23: [], 73: [], 94: [],  # pages — no category
    137: ['main'], 139: ['main'],
    142: ['main'], 145: ['main', 'today', 'tech'],
    158: ['main', 'tech'], 226: ['main'], 235: ['main'],
}

# Display name for each category slug used in CATEGORIES.
CATEGORY_NAMES = {
    'main': 'Главная',
    'today': 'Настоящее',
    'tech': 'Техническое',
}

def _build_item(row: dict) -> dict:
    """Turn one raw export row into the record written to posts.json / pages.json."""
    old_slug = row['name']
    # Single lookup, and a copy so the emitted record never aliases the
    # shared lists stored in CATEGORIES.
    category_slugs = list(CATEGORIES.get(row['id'], []))
    return {
        'id': row['id'],
        # slugify_ru() percent-decodes by itself, so no special-casing of
        # encoded slugs is needed here.
        'slug': slugify_ru(old_slug),
        'oldSlug': old_slug,
        'title': row['title'],
        'date': row['date'],
        'excerpt': row.get('excerpt') or '',
        'html': rewrite_uploads(row['content']),
        'categories': [CATEGORY_NAMES[slug] for slug in category_slugs],
        'categorySlugs': category_slugs,
    }


def main() -> None:
    """Convert RAW into OUT_DIR/posts.json and OUT_DIR/pages.json.

    Every non-empty line of RAW is parsed as a JSON object. Rows whose
    'type' is 'post' go to posts.json; every other row goes to pages.json.
    Posts are written in descending order of their 'date' value, pages in
    ascending order of 'id'. A summary of the generated slugs is printed
    to stdout.

    Raises:
        json.JSONDecodeError: if a non-empty line is not valid JSON.
        KeyError: if a row lacks a required field, or a category slug in
            CATEGORIES has no entry in CATEGORY_NAMES.
    """
    posts: list[dict] = []
    pages: list[dict] = []
    with RAW.open(encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            row = json.loads(line)
            item = _build_item(row)
            target = posts if row['type'] == 'post' else pages
            target.append(item)

    posts.sort(key=lambda p: p['date'], reverse=True)
    pages.sort(key=lambda p: p['id'])

    for filename, items in (('posts.json', posts), ('pages.json', pages)):
        (OUT_DIR / filename).write_text(
            json.dumps(items, ensure_ascii=False, indent=2), encoding='utf-8'
        )

    print(f'posts: {len(posts)} → src/content/posts.json')
    print(f'pages: {len(pages)} → src/content/pages.json')
    for p in posts:
        old = p['oldSlug']
        # Old slugs can be long percent-encoded strings; show at most 40 chars.
        shown = old[:40] + ('...' if len(old) > 40 else '')
        print(f"  post: /{p['slug']}/ (was: {shown}) — {p['title']}")
    for p in pages:
        print(f"  page: /{p['slug']}/ — {p['title']}")


if __name__ == '__main__':
    main()