98 lines
3.3 KiB
Python
98 lines
3.3 KiB
Python
#!/usr/bin/env python3
|
||
"""Convert WP posts-raw.jsonl → src/content/{posts,pages}.json with image URL rewrite + translit slugs."""
|
||
import json
|
||
import re
|
||
import sys
|
||
from pathlib import Path
|
||
from urllib.parse import unquote
|
||
|
||
# Project root: two levels above this file, i.e. the script sits in a direct
# subdirectory of the root.
ROOT = Path(__file__).resolve().parent.parent

# Input file read by main(): parsed one JSON document per non-empty line.
RAW = ROOT.joinpath("scripts", "posts-raw.jsonl")

# Directory that receives the generated posts.json / pages.json.
OUT_DIR = ROOT.joinpath("src", "content")

# Runs at import time; a no-op when the directory already exists.
OUT_DIR.mkdir(parents=True, exist_ok=True)
|
||
|
||
# Lowercase Russian Cyrillic -> Latin transliteration table. The hard sign
# and the soft sign map to the empty string, so they vanish from the slug.
TRANSLIT = {
    'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd', 'е': 'e', 'ё': 'yo',
    'ж': 'zh', 'з': 'z', 'и': 'i', 'й': 'y', 'к': 'k', 'л': 'l', 'м': 'm',
    'н': 'n', 'о': 'o', 'п': 'p', 'р': 'r', 'с': 's', 'т': 't', 'у': 'u',
    'ф': 'f', 'х': 'h', 'ц': 'ts', 'ч': 'ch', 'ш': 'sh', 'щ': 'sch',
    'ъ': '', 'ы': 'y', 'ь': '', 'э': 'e', 'ю': 'yu', 'я': 'ya',
}


def slugify_ru(s: str) -> str:
    """Return an ASCII URL slug for a (possibly percent-encoded) Russian string.

    The input is percent-decoded and lowercased, then handled per character:

    * characters present in ``TRANSLIT`` are replaced by their Latin form;
    * ASCII letters, ASCII digits, ``-`` and ``_`` are kept as they are;
    * any whitespace character becomes a ``-`` separator;
    * everything else (punctuation, non-ASCII characters that are not in the
      table) is dropped.

    Runs of ``-`` are collapsed to one and leading/trailing ``-`` removed.

    Args:
        s: Source slug or title; may contain ``%XX`` escapes.

    Returns:
        The slug, or ``'untitled'`` when nothing usable is left.
    """
    out = []
    for ch in unquote(s).lower():
        if ch in TRANSLIT:
            out.append(TRANSLIT[ch])
        elif ch in '-_' or (ch.isascii() and ch.isalnum()):
            # str.isalnum() alone is Unicode-aware and would let characters
            # such as '²' or 'é' through, producing a non-ASCII slug.
            out.append(ch)
        elif ch.isspace():
            # Every kind of whitespace (newline, NBSP, ...) separates words,
            # not only space and tab.
            out.append('-')
    res = re.sub(r'-+', '-', ''.join(out)).strip('-')
    return res or 'untitled'
|
||
|
||
# Absolute URL of a file under the old site's wp-content/uploads/ tree.
# Two directory segments follow "uploads/" (presumably WordPress's year/month
# folders - confirm against the export); group 1 captures the rest of the
# path. The segment classes exclude quotes, whitespace, ')' and angle
# brackets so a match can never run past the end of the URL into the
# surrounding markup.
UPLOAD_RE = re.compile(
    r'https?://(?:www\.)?pushkinohistory\.ru/wp-content/uploads/'
    r'[^/"\'\s)<>]+/[^/"\'\s)<>]+/'
    r'([^"\'\s)]+)'
)


def rewrite_uploads(html: str) -> str:
    """Point old-site upload URLs at the local ``/uploads/`` directory.

    Every match of ``UPLOAD_RE`` is replaced by ``/uploads/<rest of path>``,
    i.e. the scheme, host and the two directory segments after ``uploads/``
    are dropped. Text that does not match is returned unchanged.

    Args:
        html: HTML fragment to rewrite.

    Returns:
        The fragment with matching upload URLs rewritten.
    """
    return UPLOAD_RE.sub(r'/uploads/\1', html)
|
||
|
||
# Record id -> category slugs attached to that record. main() looks rows up
# here by row['id']; ids that are absent get no categories.
CATEGORIES = {
    # pages — no category
    20: [],
    23: [],
    73: [],
    94: [],
    # categorised records
    137: ['main'],
    139: ['main'],
    142: ['main'],
    145: ['main', 'today', 'tech'],
    158: ['main', 'tech'],
    226: ['main'],
    235: ['main'],
}
|
||
|
||
# Category slug -> display name written to the 'categories' field of each
# output record.
CATEGORY_NAMES = dict(
    main='Главная',
    today='Настоящее',
    tech='Техническое',
)
|
||
|
||
def _build_item(row: dict) -> dict:
    """Turn one raw export row into the record stored in posts.json / pages.json."""
    old_slug = row['name']
    category_slugs = CATEGORIES.get(row['id'], [])
    return {
        'id': row['id'],
        # slugify_ru() percent-decodes its argument itself, so encoded and
        # plain slugs go through the same call.
        'slug': slugify_ru(old_slug),
        'oldSlug': old_slug,
        'title': row['title'],
        'date': row['date'],
        'excerpt': row.get('excerpt') or '',
        'html': rewrite_uploads(row['content']),
        'categories': [CATEGORY_NAMES[c] for c in category_slugs],
        # Copy so the record never aliases a list in the module-level table.
        'categorySlugs': list(category_slugs),
    }


def _write_json(path, items: list) -> None:
    """Write *items* to *path* as indented UTF-8 JSON, keeping non-ASCII text readable."""
    path.write_text(
        json.dumps(items, ensure_ascii=False, indent=2), encoding='utf-8'
    )


def main() -> None:
    """Convert the raw JSONL export into ``posts.json`` and ``pages.json``.

    Reads ``RAW`` line by line (blank lines are skipped), builds one record
    per row, and splits the records by ``row['type']``: ``'post'`` rows go to
    posts, every other type goes to pages. Posts are sorted by ``'date'`` in
    descending order, pages by ``'id'`` in ascending order. Both lists are
    written to ``OUT_DIR`` and a summary is printed to stdout.

    Raises:
        json.JSONDecodeError: A non-empty line is not valid JSON; the message
            is prefixed with the input path and the 1-based line number.
        KeyError: A row lacks a required field, or a category slug has no
            entry in ``CATEGORY_NAMES``.
    """
    posts: list[dict] = []
    pages: list[dict] = []
    with RAW.open(encoding='utf-8') as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError as exc:
                # Same exception type, but say which input line is broken.
                raise json.JSONDecodeError(
                    f'{RAW}:{lineno}: {exc.msg}', exc.doc, exc.pos
                ) from exc
            target = posts if row['type'] == 'post' else pages
            target.append(_build_item(row))

    posts.sort(key=lambda p: p['date'], reverse=True)
    pages.sort(key=lambda p: p['id'])

    _write_json(OUT_DIR / 'posts.json', posts)
    _write_json(OUT_DIR / 'pages.json', pages)

    print(f'posts: {len(posts)} → src/content/posts.json')
    print(f'pages: {len(pages)} → src/content/pages.json')
    for p in posts:
        print(f" post: /{p['slug']}/ (was: {p['oldSlug'][:40]}{'...' if len(p['oldSlug']) > 40 else ''}) — {p['title']}")
    for p in pages:
        print(f" page: /{p['slug']}/ — {p['title']}")
|
||
|
||
# Script entry point: run the conversion only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
|