#!/usr/bin/env python3
# Kleidos — i18n codegen.
#
# Reads i18n/strings.csv and emits:
#   src/i18n/Strings_gen.h   (StringId enum + gen::string accessor decl)
#   src/i18n/Strings_gen.cpp (packed, tail-merged string pool + offset table)
#
# Hooked from platformio.ini as `extra_scripts = pre:scripts/gen_i18n.py`.
# Stays a no-op when the generated files are already up-to-date.

import csv
import os
import sys
from pathlib import Path

# `__file__` may not be defined under SCons exec(); fall back to PROJECT_DIR.
try:
    SCRIPT_DIR = Path(__file__).resolve().parent
except NameError:
    # No `__file__`: use <PROJECT_DIR>/scripts, reading PROJECT_DIR from the
    # process environment and defaulting to the current working directory.
    SCRIPT_DIR = Path(os.environ.get("PROJECT_DIR", os.getcwd())) / "scripts"
# Parent of the script directory; every input/output path hangs off it.
ROOT = SCRIPT_DIR.parent
# Translation table that is read as input.
CSV_PATH = ROOT / "i18n" / "strings.csv"
# Generated outputs (header and implementation file).
HDR_PATH = ROOT / "src" / "i18n" / "Strings_gen.h"
CPP_PATH = ROOT / "src" / "i18n" / "Strings_gen.cpp"

# Language codes, looked up against the lower-cased CSV header cells. The order
# here is the order in which render_cpp() emits the per-language offset rows.
LANGS = ["en", "es", "fr", "de", "it"]


def cpp_escape(s: str) -> str:
    """Escape *s* for use inside a double-quoted C++ string literal.

    Backslash, double quote, newline, carriage return and tab are replaced by
    their two-character escape sequences. Every other character, including
    non-ASCII text, is passed through unchanged.
    """
    replacements = {
        "\\": "\\\\",
        '"': '\\"',
        "\n": "\\n",
        "\r": "\\r",
        "\t": "\\t",
    }
    return "".join(replacements.get(ch, ch) for ch in s)


def parse_csv(path: Path, *, langs=None):
    """Parse the translation CSV into ``[(id, {lang: text}), ...]``.

    The first row is the header: its first cell must be ``id`` and it must
    contain an ``en`` column. Header cells are matched case-insensitively with
    surrounding whitespace ignored. Rows that are blank, have an empty id, or
    whose id starts with ``#`` are skipped.

    A translation cell that is absent or contains only whitespace falls back
    to the row's English cell. A row that is too short to have an English cell
    is treated as if it were padded with empty cells.

    Args:
        path: Location of the CSV file (UTF-8, with or without a BOM).
        langs: Language codes to extract for every row; defaults to ``LANGS``.

    Returns:
        Rows in file order as ``(id, {lang: text})`` tuples.

    Raises:
        SystemExit: If the header is missing, does not start with ``id``, or
            has no ``en`` column.
    """
    langs = LANGS if langs is None else langs
    rows = []
    # utf-8-sig reads plain UTF-8 and additionally drops a leading BOM, which
    # would otherwise become part of the first header cell ("\ufeffid").
    with path.open("r", encoding="utf-8-sig", newline="") as f:
        reader = csv.reader(f)
        header = next(reader, None)
        if not header or header[0].strip().lower() != "id":
            raise SystemExit(f"i18n: missing or unexpected header {header}")
        col = {h.strip().lower(): i for i, h in enumerate(header)}
        # English is the fallback column, so it must exist before parsing rows.
        if "en" not in col:
            raise SystemExit("i18n: CSV must contain an 'en' column")
        en_idx = col["en"]
        for r in reader:
            if not r or not r[0].strip() or r[0].strip().startswith("#"):
                continue
            sid = r[0].strip()
            # csv.reader yields short rows when trailing cells are omitted;
            # guard the index so such a row cannot raise IndexError.
            english = r[en_idx] if en_idx < len(r) else ""
            entry = {}
            for lang in langs:
                idx = col.get(lang)
                val = r[idx] if (idx is not None and idx < len(r)) else ""
                # Fall back to English when a translation is missing.
                entry[lang] = val if val.strip() else english
            rows.append((sid, entry))
    return rows


def render_header(rows, *, langs=None):
    """Render the text of the generated C++ header.

    The header declares ``enum class StringId`` with one enumerator per row id
    (in row order) followed by ``COUNT``, the ``kStringCount`` and
    ``kLangCount`` constants, and the ``gen::string`` accessor.

    Args:
        rows: Sequence of ``(id, translations)`` pairs; only the ids are used.
        langs: Language codes that size ``kLangCount``; defaults to ``LANGS``.

    Returns:
        The complete header text, ending with a newline.
    """
    langs = LANGS if langs is None else langs
    # Derive the count and its comment from the language list so the header
    # cannot drift from the per-language rows of the generated offset table.
    lang_names = ", ".join(lang.upper() for lang in langs)

    lines = [
        "// AUTO-GENERATED by scripts/gen_i18n.py — DO NOT EDIT MANUALLY.",
        "// Regenerated from i18n/strings.csv on every build.",
        "#pragma once",
        "#include <cstdint>",
        "",
        "namespace i18n {",
        "",
        "enum class StringId : uint16_t {",
    ]
    lines.extend(f"    {sid}," for sid, _ in rows)
    lines += [
        "    COUNT",
        "};",
        "",
        "constexpr uint16_t kStringCount = static_cast<uint16_t>( StringId::COUNT );",
        f"constexpr uint8_t  kLangCount   = {len(langs)};  // {lang_names}",
        "",
        "namespace gen {",
        "",
        "/**",
        " * @brief Return translation (@p lang, @p index) as a NUL-terminated string.",
        " *",
        " * Points into the packed flash pool; the returned pointer stays valid for",
        " * the program lifetime. No bounds checking — callers must ensure",
        " * @p lang < @c kLangCount and @p index < @c kStringCount.",
        " */",
        "const char* string( uint8_t lang, uint16_t index );",
        "",
        "}  // namespace gen",
        "",
        "}  // namespace i18n",
        "",
    ]
    return "\n".join(lines)


def build_pool(rows, *, langs=None):
    """Build a tail-merged string pool.

    Strings are placed longest-first (by UTF-8 byte length, ties broken by
    string order) so that any string which is the suffix of another collapses
    onto the longer one's bytes (e.g. "ancel" reuses the tail of "Cancel");
    identical strings are deduplicated outright.

    Args:
        rows: Sequence of ``(id, {lang: text})`` pairs.
        langs: Language codes whose texts go into the pool; defaults to
            ``LANGS``.

    Returns:
        A 3-tuple ``(emit, offsets, size)``: ``emit`` is the list of strings
        appended to the pool in order (each contributes its UTF-8 bytes + a
        NUL), ``offsets`` maps every unique string to its byte offset into the
        concatenated blob, and ``size`` is the blob's total length in bytes.
    """
    langs = LANGS if langs is None else langs
    uniq = sorted(
        {entry[lang] for _, entry in rows for lang in langs},
        key=lambda s: (-len(s.encode("utf-8")), s),
    )
    blob = bytearray()
    offsets = {}
    emit = []
    for s in uniq:
        # Searching for the text *plus* its terminator only matches where an
        # already-pooled string ends with exactly these bytes.
        needle = s.encode("utf-8") + b"\0"
        # bytearray searches in place; no need to copy the pool on every pass.
        pos = blob.find(needle)
        if pos >= 0:
            offsets[s] = pos
        else:
            offsets[s] = len(blob)
            blob += needle
            emit.append(s)
    return emit, offsets, len(blob)


def render_cpp(rows):
    """Render the text of the generated C++ implementation file.

    The file defines ``kPool`` (one string literal per pooled string, each
    annotated with its byte offset), ``kOffsets`` (one row of offsets per
    language in ``LANGS``, twelve values per source line) and the
    ``gen::string`` accessor that indexes the pool through the offset table.

    Args:
        rows: Sequence of ``(id, {lang: text})`` pairs.

    Returns:
        The complete source text, ending with a newline.
    """
    pool_strings, offset_of, pool_bytes = build_pool(rows)
    # The initializer is a string literal, which carries one implicit NUL past
    # the pooled bytes; size the array for it so the initializer is not too
    # long for the array (which -Werror rejects).
    array_len = pool_bytes + 1

    out = [
        "// AUTO-GENERATED by scripts/gen_i18n.py — DO NOT EDIT MANUALLY.",
        '#include "Strings_gen.h"',
        "",
        "#include <array>",
        "#include <cstdint>",
        "",
        "namespace i18n {",
        "namespace gen {",
        "",
        "// Packed translation pool: all strings concatenated into one flash blob,",
        "// deduplicated and tail-merged (a string that is the suffix of another",
        "// shares its bytes). Adjacent string literals concatenate; the offsets",
        "// below index into the resulting bytes.",
        "// clang-format off",
        f"constexpr std::array<char, {array_len}> kPool = {{",
    ]

    # One literal per pooled string; the trailing comment is its start offset.
    cursor = 0
    for text in pool_strings:
        out.append(f'    "{cpp_escape(text)}\\0"  // @{cursor}')
        cursor += len(text.encode("utf-8")) + 1

    out += [
        "};",
        "// clang-format on",
        "",
        "// Per-language byte offsets into kPool. uint16_t keeps this table half the",
        "// size of a 32-bit pointer table and free of load-time relocations.",
        "// clang-format off",
        "constexpr std::array<std::array<uint16_t, kStringCount>, kLangCount> kOffsets = { {",
    ]

    per_line = 12
    for lang in LANGS:
        table = [str(offset_of[entry[lang]]) for _, entry in rows]
        out.append(f"    /* {lang.upper()} */ {{ {{")
        out.extend(
            "        " + ", ".join(table[start : start + per_line]) + ","
            for start in range(0, len(table), per_line)
        )
        out.append("    } },")

    out += [
        "} };",
        "// clang-format on",
        "",
        'static_assert( kPool.size() <= UINT16_MAX, "pool exceeds uint16_t offset range" );',
        "",
        "const char* string( uint8_t lang, uint16_t index ) {",
        "    return kPool.data() + kOffsets[lang][index];",
        "}",
        "",
        "}  // namespace gen",
        "}  // namespace i18n",
        "",
    ]
    return "\n".join(out)


def write_if_changed(path: Path, content: str) -> bool:
    """Write *content* to *path* unless the file already holds exactly that text.

    Parent directories are created as needed. Leaving an up-to-date file
    untouched avoids triggering recompilation.

    Returns:
        True if the file was written, False if it was already current.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    already_current = path.exists() and path.read_text(encoding="utf-8") == content
    if already_current:
        return False
    path.write_text(content, encoding="utf-8")
    return True


def generate():
    """Regenerate the header and source files from the translation CSV.

    Reports to stderr and returns False when the CSV is missing or yields no
    rows. Otherwise renders both outputs, rewrites only those whose content
    changed, prints a summary if anything was rewritten, and returns True.
    """
    if not CSV_PATH.exists():
        print(f"i18n: missing {CSV_PATH}", file=sys.stderr)
        return False

    rows = parse_csv(CSV_PATH)
    if not rows:
        print("i18n: CSV produced no rows", file=sys.stderr)
        return False

    # Header first, then the source file; both are always attempted.
    targets = ((HDR_PATH, render_header), (CPP_PATH, render_cpp))
    rewritten = [write_if_changed(dest, render(rows)) for dest, render in targets]
    if any(rewritten):
        print(f"i18n: regenerated {len(rows)} ids x {len(LANGS)} langs")
    return True


# PlatformIO entry point.
try:
    Import("env")  # type: ignore[name-defined]  # noqa: F821
except NameError:
    # `Import` is not defined, so this is not a PlatformIO/SCons run: generate
    # only for a standalone CLI invocation, not when merely imported.
    if __name__ == "__main__":
        generate()
else:
    # Deliberately outside the `try`: a NameError raised inside generate() must
    # surface as an error rather than be mistaken for "Import is undefined".
    generate()
