#!/usr/bin/env python3
# Kleidos — i18n codegen.
#
# Reads i18n/strings.csv and emits:
#   src/i18n/Strings_gen.h    (StringId enum + gen::string accessor decl)
#   src/i18n/Strings_gen.cpp  (packed, tail-merged string pool + offset table)
#
# Hooked from platformio.ini as `extra_scripts = pre:scripts/gen_i18n.py`.
# Stays a no-op when the generated files are already up-to-date.

import csv
import os
import sys
from pathlib import Path

# `__file__` may not be defined under SCons exec(); fall back to PROJECT_DIR.
try:
    SCRIPT_DIR = Path(__file__).resolve().parent
except NameError:
    SCRIPT_DIR = Path(os.environ.get("PROJECT_DIR", os.getcwd())) / "scripts"

ROOT = SCRIPT_DIR.parent
CSV_PATH = ROOT / "i18n" / "strings.csv"
HDR_PATH = ROOT / "src" / "i18n" / "Strings_gen.h"
CPP_PATH = ROOT / "src" / "i18n" / "Strings_gen.cpp"

# Column order of the generated offset table; "en" must stay first because
# it is the fallback for every missing translation.
LANGS = ["en", "es", "fr", "de", "it"]

# Characters that need a backslash escape inside a C++ string literal.
_CPP_ESCAPES = str.maketrans(
    {
        "\\": "\\\\",
        '"': '\\"',
        "\n": "\\n",
        "\r": "\\r",
        "\t": "\\t",
    }
)


def cpp_escape(s: str) -> str:
    """Return *s* escaped for use inside a double-quoted C++ string literal.

    Backslash, double quote, newline, carriage return and tab are replaced by
    their backslash escapes; every other character is passed through as-is.
    """
    return s.translate(_CPP_ESCAPES)


def parse_csv(path: Path):
    """Parse the translation CSV at *path*.

    Returns a list of ``(id, {lang: text})`` tuples in file order, with one
    dict key per entry of ``LANGS``. Rows that are empty, have a blank id, or
    whose id starts with ``#`` are skipped. A missing or blank translation
    falls back to the row's English text.

    Raises:
        SystemExit: if the header is missing, does not start with ``id``,
            lacks an ``en`` column, or if an id is duplicated or collides
            with the generated ``COUNT`` enumerator.
    """
    rows = []
    seen = set()
    # utf-8-sig reads plain UTF-8 too, but drops a leading BOM (as written by
    # spreadsheet exports) that would otherwise corrupt the first header name.
    with path.open("r", encoding="utf-8-sig", newline="") as f:
        reader = csv.reader(f)
        header = next(reader, None)
        if not header or header[0].strip().lower() != "id":
            raise SystemExit(f"i18n: missing or unexpected header {header}")
        col = {h.strip().lower(): i for i, h in enumerate(header)}
        # English is the fallback column, so it must exist before parsing rows.
        if "en" not in col:
            raise SystemExit("i18n: CSV must contain an 'en' column")
        en_idx = col["en"]
        for r in reader:
            sid = r[0].strip() if r else ""
            if not sid or sid.startswith("#"):
                continue
            # Both cases would otherwise surface later as an opaque C++
            # redefinition error in the generated enum.
            if sid == "COUNT" or sid in seen:
                raise SystemExit(f"i18n: duplicate or reserved id '{sid}'")
            seen.add(sid)
            # Short rows (fewer cells than the header) read as empty text
            # instead of raising IndexError.
            english = r[en_idx] if en_idx < len(r) else ""
            entry = {}
            for lang in LANGS:
                idx = col.get(lang)
                val = r[idx] if (idx is not None and idx < len(r)) else ""
                # Fall back to English when a translation is missing.
                entry[lang] = val if val.strip() else english
            rows.append((sid, entry))
    return rows


def render_header(rows):
    """Return the text of Strings_gen.h for *rows* (as produced by parse_csv).

    The header declares the ``StringId`` enum (one enumerator per row id plus
    a trailing ``COUNT``), the ``kStringCount`` / ``kLangCount`` constants and
    the ``gen::string`` accessor.
    """
    lang_names = ", ".join(lang.upper() for lang in LANGS)
    lines = [
        "// AUTO-GENERATED by scripts/gen_i18n.py — DO NOT EDIT MANUALLY.",
        "// Regenerated from i18n/strings.csv on every build.",
        "#pragma once",
        "#include <cstdint>",
        "",
        "namespace i18n {",
        "",
        "enum class StringId : uint16_t {",
    ]
    lines.extend(f" {sid}," for sid, _ in rows)
    lines.extend(
        [
            " COUNT",
            "};",
            "",
            "constexpr uint16_t kStringCount = static_cast<uint16_t>( StringId::COUNT );",
            # Derived from LANGS so the constant cannot drift from the table.
            f"constexpr uint8_t kLangCount = {len(LANGS)}; // {lang_names}",
            "",
            "namespace gen {",
            "",
            "/**",
            " * @brief Return translation (@p lang, @p index) as a NUL-terminated string.",
            " *",
            " * Points into the packed flash pool; the returned pointer stays valid for",
            " * the program lifetime. No bounds checking — callers must ensure",
            " * @p lang < @c kLangCount and @p index < @c kStringCount.",
            " */",
            "const char* string( uint8_t lang, uint16_t index );",
            "",
            "} // namespace gen",
            "",
            "} // namespace i18n",
            "",
        ]
    )
    return "\n".join(lines)


def build_pool(rows):
    """Build a tail-merged string pool.

    Returns ``(emit, offsets, size)`` where `emit` is the list of strings
    appended to the pool in order (each contributes its UTF-8 bytes + a NUL),
    `offsets` maps every unique string to its byte offset into the
    concatenated blob, and `size` is the blob length in bytes.

    Strings are placed longest-first so that any string which is the suffix
    of another collapses onto the longer one's bytes (e.g. "ancel" reuses the
    tail of "Cancel"); identical strings are deduplicated outright.
    """
    uniq = sorted(
        {entry[lang] for _, entry in rows for lang in LANGS},
        key=lambda s: (-len(s.encode("utf-8")), s),
    )
    blob = bytearray()
    offsets = {}
    emit = []
    for s in uniq:
        # Searching for text + NUL guarantees a hit ends exactly at an
        # existing terminator, i.e. the match is a true suffix.
        needle = s.encode("utf-8") + b"\0"
        pos = blob.find(needle)
        if pos >= 0:
            offsets[s] = pos
        else:
            offsets[s] = len(blob)
            blob += needle
            emit.append(s)
    return emit, offsets, len(blob)


def render_cpp(rows):
    """Return the text of Strings_gen.cpp for *rows* (as produced by parse_csv).

    Emits the packed ``kPool`` character array, the per-language ``kOffsets``
    table (one inner array per entry of ``LANGS``, one offset per row) and
    the definition of ``gen::string``.
    """
    emit, offsets, blob_len = build_pool(rows)
    # +1 keeps the string literal's implicit terminator so the array size is not
    # one short of the initializer (which -Werror rejects).
    pool_size = blob_len + 1

    lines = [
        "// AUTO-GENERATED by scripts/gen_i18n.py — DO NOT EDIT MANUALLY.",
        '#include "Strings_gen.h"',
        "",
        "#include <array>",
        "#include <cstdint>",
        "",
        "namespace i18n {",
        "namespace gen {",
        "",
        "// Packed translation pool: all strings concatenated into one flash blob,",
        "// deduplicated and tail-merged (a string that is the suffix of another",
        "// shares its bytes). Adjacent string literals concatenate; the offsets",
        "// below index into the resulting bytes.",
        "// clang-format off",
        f"constexpr std::array<char, {pool_size}> kPool = {{",
    ]
    running = 0
    for s in emit:
        lines.append(f' "{cpp_escape(s)}\\0" // @{running}')
        running += len(s.encode("utf-8")) + 1
    lines.extend(
        [
            "};",
            "// clang-format on",
            "",
            "// Per-language byte offsets into kPool. uint16_t keeps this table half the",
            "// size of a 32-bit pointer table and free of load-time relocations.",
            "// clang-format off",
            "constexpr std::array<std::array<uint16_t, kStringCount>, kLangCount> kOffsets = { {",
        ]
    )
    for lang in LANGS:
        offs = [offsets[entry[lang]] for _, entry in rows]
        lines.append(f" /* {lang.upper()} */ {{ {{")
        # Twelve offsets per source line keeps the generated table readable.
        for i in range(0, len(offs), 12):
            chunk = ", ".join(str(o) for o in offs[i : i + 12])
            lines.append(f" {chunk},")
        lines.append(" } },")
    lines.extend(
        [
            "} };",
            "// clang-format on",
            "",
            'static_assert( kPool.size() <= UINT16_MAX, "pool exceeds uint16_t offset range" );',
            "",
            "const char* string( uint8_t lang, uint16_t index ) {",
            " return kPool.data() + kOffsets[lang][index];",
            "}",
            "",
            "} // namespace gen",
            "} // namespace i18n",
            "",
        ]
    )
    return "\n".join(lines)


def write_if_changed(path: Path, content: str) -> bool:
    """Write *content* to *path* only if it differs from what is on disk.

    Creates the parent directory when needed. Returns True when the file was
    written and False when it already held *content* (leaving the file
    untouched avoids triggering recompilation).
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    if path.exists() and path.read_text(encoding="utf-8") == content:
        return False
    path.write_text(content, encoding="utf-8")
    return True


def generate():
    """Regenerate both output files from the CSV.

    Returns False (after reporting on stderr) when the CSV is missing or
    yields no rows, True otherwise. Files are rewritten only when their
    content changes.
    """
    if not CSV_PATH.exists():
        print(f"i18n: missing {CSV_PATH}", file=sys.stderr)
        return False
    rows = parse_csv(CSV_PATH)
    if not rows:
        print("i18n: CSV produced no rows", file=sys.stderr)
        return False
    h_changed = write_if_changed(HDR_PATH, render_header(rows))
    c_changed = write_if_changed(CPP_PATH, render_cpp(rows))
    if h_changed or c_changed:
        print(f"i18n: regenerated {len(rows)} ids x {len(LANGS)} langs")
    return True


# PlatformIO entry point. Only the Import() probe sits inside the try: a
# NameError raised by generate() itself must propagate instead of being
# mistaken for "not running under SCons".
try:
    Import("env")  # type: ignore[name-defined]  # noqa: F821
except NameError:
    _UNDER_PLATFORMIO = False
else:
    _UNDER_PLATFORMIO = True

# Run as a PlatformIO pre-script, or as a standalone CLI invocation.
if _UNDER_PLATFORMIO or __name__ == "__main__":
    generate()