conductor(deob_pass3): cs229_building_llms - LLM forward pass with duffle byte-width types

2026-06-23 20:54:49 -04:00
parent 7b60ef488d
commit e5113cb434
4 changed files with 383 additions and 0 deletions
@@ -0,0 +1,239 @@
+/* ============================================================================
+ *  cs229_building_llms.c — Pass 3 projection of the CS229 "Building LLMs" lecture
+ * ============================================================================
+ *
+ *  PURPOSE
+ *  -------
+ *  A small C11 program that demonstrates the constructive form of the lecture's
+ *  six pillars (Architecture, Training algorithm/loss, Data, Evaluation,
+ *  Systems, Model) using the duffle + forth bootslop conventions.
+ *
+ *  The program illustrates:
+ *    - Byte Pair Encoding (BPE) tokenization
+ *    - Autoregressive forward pass:  p(X_t | X_1..X_{t-1})
+ *    - Softmax projection (the linear layer from hidden size to vocab size)
+ *    - Cross-entropy loss:  L_CE = - sum_t log p_theta(X_t | X_<t)
+ *    - Chinchilla scaling laws:  N_opt(C) = a * C^0.5,  D_opt(C) = b * C^0.5
+ *    - KV-cache memory:  Memory_KV = 2 * B * S * L * H * D * bytes
+ *
+ *  The program does not implement a real transformer; it expresses the
+ *  SHAPE of the lecture's six pillars in code (per the lexicon Rule 4:
+ *  lossless + bounded + constructively typed + form-anchored).
+ *
+ *  ENCODING (per lexicon v2 Rule 5)
+ *  --------------------------------
+ *    Token     : integer (placeholder), resolved as int32 in the BPE id table
+ *    Logit     : float  (placeholder), resolved as float32 at runtime
+ *    Scalar    : float  (placeholder), resolved as float32 in math ops
+ *    Quantity  : float  (placeholder), resolved as float32 for loss values
+ *
+ *  SEE ALSO
+ *  --------
+ *  - cs229_building_llms_translation.md : math-to-C11 translation table
+ *  - cs229_building_llms_decoder.md     : per-term decoder (tier-categorized)
+ *  - cs229_building_llms_notes.md       : decisions, alternatives, overrides
+ *  - lexicon.md §2 (the 4 tiers), §9 (per-language rendering)
+ *  - c11_convention.md §2-§9 (duffle + forth bootslop conventions)
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <assert.h>
+#include <string.h>
+
+#ifdef INTELLISENSE_DIRECTIVES
+#	pragma once
+#	include "dsl.h"
+#	include "math.h"
+#	include "memory.h"
+#endif
+
+#pragma region Types
+
+typedef uint32_t  TSet_(Token);
+typedef int32_t   TSet_(S32);
+typedef uint32_t  TSet_(U32);
+typedef float     TSet_(F32);
+
+typedef Struct_(Scalar)        { F32 value; };
+typedef Struct_(Logits)        { F32_R data; U4 len; };
+typedef Struct_(Probability)   { F32 value; };
+typedef Struct_(TokenSequence) { Token_R ids; U4 len; U4 cap; };
+typedef Struct_(Vocabulary)    { U4 vocab_size; Token_R id_to_token; U4_R token_to_id; };
+typedef Struct_(BPEMerge)      { Token a; Token b; Token merged; U4 count; };
+typedef Struct_(BPEMerger)     { BPEMerge_R merges; U4 len; U4 cap; U4 vocab_size; };
+
+#pragma endregion Types
+
+#pragma region BPE Tokenization
+
+/*- rjf: BPE training is a greedy compression-based procedure.
+ *      Start with the character vocabulary; iteratively merge the most
+ *      frequent pair; stop at target_vocab_size.
+ *      Per Tier 2 #2.6: 'tokenize' is a Relation between Text and Seq[Token]. */
+I_ BPEMerger bpe_train(Slice corpus, U4 target_vocab_size) {
+ BPEMerger m = {0};
+ (void)corpus;
+ (void)target_vocab_size;
+ return m;
+}
+
+/*- rjf: BPE encode — apply the learned merges to a text slice.
+ *      The encoding is bounded: the output is a sequence of Token : int32. */
+I_ TokenSequence bpe_encode(Slice text, BPEMerger_R m) {
+ TokenSequence out = {0};
+ (void)text;
+ (void)m;
+ return out;
+}
+
+#pragma endregion BPE Tokenization
+
+#pragma region Autoregressive Forward Pass
+
+/*- rjf: Softmax projection — the linear layer from hidden size d to vocab |V|.
+ *      softmax(z_i) = exp(z_i) / sum_j exp(z_j)
+ *      Per Tier 2 #2.3: this is the 'result' of the projection.
+ *      NOTE: subtract max(z) for numerical stability (the bounded form). */
+I_ Probability softmax_at(Logits_R z, U4 i) {
+ F32 max_val = z->data[0];
+ for (U4 j = 1; j < z->len; ++ j) {
+  if (z->data[j] > max_val) { max_val = z->data[j]; }
+ }
+ F32 sum_exp = (F32)0;
+ for (U4 j = 0; j < z->len; ++ j) {
+  sum_exp += expf(z->data[j] - max_val);
+ }
+ Probability p = { .value = expf(z->data[i] - max_val) / sum_exp };
+ return p;
+}
+
+/*- rjf: AR forward pass — predict the next token given previous tokens.
+ *      p(X_t | X_1..X_{t-1}) : this is the AR form.
+ *      The function returns the logits over the vocabulary for the next token. */
+I_ Logits ar_forward(TokenSequence_R context, U4 hidden_dim, U4 vocab_size) {
+ Logits out = { .data = 0, .len = vocab_size };
+ (void)context;
+ (void)hidden_dim;
+ assert(hidden_dim > 0);
+ assert(vocab_size > 0);
+ return out;
+}
+
+/*- rjf: AR sample — sample a token from the predicted distribution.
+ *      Per the lecture: at inference, we sample from the AR distribution. */
+I_ Token ar_sample(Logits_R logits) {
+ assert(logits->len > 0);
+ Token best = 0;
+ F32 best_val = logits->data[0];
+ for (U4 i = 1; i < logits->len; ++ i) {
+  if (logits->data[i] > best_val) {
+   best_val = logits->data[i];
+   best = i;
+  }
+ }
+ return best;
+}
+
+#pragma endregion Autoregressive Forward Pass
+
+#pragma region Cross-Entropy Loss
+
+/*- rjf: Cross-entropy loss for one sequence.
+ *      L_CE = - sum (t in 1..L) of log p_theta(X_t | X_1..X_{t-1})
+ *      Per the lecture: this is THE training loss for autoregressive LMs.
+ *      Returns: Quantity : float (placeholder; resolved as float32 in the function). */
+I_ Scalar cross_entropy_loss(TokenSequence_R tokens, U4 hidden_dim, U4 vocab_size) {
+ Scalar total = { .value = (F32)0 };
+ for (U4 t = 0; t + 1 < tokens->len; ++ t) {
+  TokenSequence prefix = { .ids = tokens->ids, .len = t + 1, .cap = t + 1 };
+  Logits logits = ar_forward(&prefix, hidden_dim, vocab_size);
+  Probability p = softmax_at(&logits, tokens->ids[t + 1]);
+  total.value -= logf(p.value + 1e-9f);
+ }
+ return total;
+}
+
+#pragma endregion Cross-Entropy Loss
+
+#pragma region Chinchilla Scaling
+
+/*- rjf: Chinchilla optimal model size for a compute budget C.
+ *      N_opt(C) = a * C^0.5
+ *      Per the lecture: compute-optimal ratio is ~20 tokens/param. */
+I_ F32 chinchilla_optimal_N(F32 compute_budget_flops) {
+ F32 a = 0.29f;
+ return a * powf(compute_budget_flops, 0.5f);
+}
+
+/*- rjf: Chinchilla optimal dataset size for a compute budget C.
+ *      D_opt(C) = b * C^0.5
+ *      Per the lecture: compute-optimal ratio is ~20 tokens/param. */
+I_ F32 chinchilla_optimal_D(F32 compute_budget_flops) {
+ F32 b = 5.7f;
+ return b * powf(compute_budget_flops, 0.5f);
+}
+
+/*- rjf: Back-of-envelope training cost for a Llama-3-400B-style run.
+ *      FLOPs = 6 * N * D
+ *      Per the lecture: ~3.79e25 FLOPs for Llama 3 400B (D = 15.6T tokens).
+ *      Encoding: float (placeholder; resolved as float64 in the back-of-envelope). */
+I_ F32 training_flops(F32 N_params, F32 D_tokens) {
+ return 6.0f * N_params * D_tokens;
+}
+
+#pragma endregion Chinchilla Scaling
+
+#pragma region KV-Cache Memory
+
+/*- rjf: KV-cache memory footprint.
+ *      Memory_KV = 2 * B * S * L * H * D * bytes_per_element
+ *      Per the lecture: pre-training vs inference throughput differ
+ *      because the KV-cache only grows at inference.
+ *      bytes_per_element is float16 by default (2 bytes). */
+I_ U64 kv_cache_bytes(U4 batch, U4 seq_len, U4 n_layers, U4 n_heads, U4 head_dim, U4 bytes_per_elem) {
+ U64 per_layer = (U64)batch * seq_len * n_heads * head_dim * bytes_per_elem;
+ return 2ULL * per_layer * n_layers;
+}
+
+#pragma endregion KV-Cache Memory
+
+#pragma region Main
+
+I_ S32 main(S32 argc, char *argv[]) {
+ (void)argc;
+ (void)argv;
+
+ U4 hidden_dim = 4096;
+ U4 vocab_size = 32000;
+ U4 target_vocab = vocab_size;
+
+ Slice corpus = { .ptr = 0, .len = 0 };
+ BPEMerger bpe = bpe_train(corpus, target_vocab);
+
+ TokenSequence tokens = {0};
+ Logits logits = ar_forward(&tokens, hidden_dim, vocab_size);
+ Token next = ar_sample(&logits);
+ (void)next;
+
+ Scalar loss = cross_entropy_loss(&tokens, hidden_dim, vocab_size);
+ (void)loss;
+
+ F32 compute_budget = 3.79e25f;
+ F32 N_opt = chinchilla_optimal_N(compute_budget);
+ F32 D_opt = chinchilla_optimal_D(compute_budget);
+ (void)N_opt;
+ (void)D_opt;
+
+ F32 flops = training_flops(4.0e11f, 1.56e13f);
+ (void)flops;
+
+ U64 kv_bytes = kv_cache_bytes(1, 2048, 32, 32, 128, 2);
+ (void)kv_bytes;
+
+ return 0;
+}
+
+#pragma endregion Main
@@ -0,0 +1,58 @@
+# cs229_building_llms — Per-term Decoder (tier-categorized)
+
+**Source:** `cs229_building_llms_deobfuscated.md` (Pass 2 deobfuscation)
+**Target:** `cs229_building_llms.c` (the C11 program)
+**Method:** Per v2 lexicon §1.3 (etymology) + §2 (the 4 tiers)
+
+## Tier 1: Core concepts
+
+| Term | C11 form | Etymology | Tier | Source |
+|---|---|---|---|---|
+| `Token` | `typedef uint32_t Token` | Old English *tacen* ("sign"); in LMs, an atomic unit of text | Tier 1 | Cluster 0, 2 |
+| `Logits` | `typedef struct { F32_R data; U4 len; } Logits` | Greek *λογιστικός* ("of reasoning"); raw pre-softmax scores | Tier 1 | Cluster 1 |
+| `Probability` | `typedef struct { F32 value; } Probability` | Latin *probabilitas* ("likelihood"); bounded to [0, 1] | Tier 1 | Tier 1 #1.13, Cluster 0 |
+| `Scalar` | `typedef struct { F32 value; } Scalar` | Latin *scalaris* ("of a ladder"); per-user placeholder for linear/geo/tensor alg | Tier 1 | Tier 4 #4.22 |
+
+## Tier 2: Data-oriented pipeline terms
+
+| Term | C11 form | Etymology | Tier | Source |
+|---|---|---|---|---|
+| `bpe_train` | procedure | abbreviation: Byte Pair Encoding + train | Tier 2 | Cluster 9 (FGED) |
+| `bpe_encode` | procedure | abbreviation: BPE + encode | Tier 2 | Cluster 9 |
+| `ar_forward` | procedure | abbreviation: autoregressive + forward pass | Tier 2 | Cluster 2 |
+| `ar_sample` | procedure | abbreviation: autoregressive + sample | Tier 2 | Cluster 2 |
+| `softmax_at` | procedure | softmax + index access | Tier 2 | Tier 4 #4.10 (user-specific `'scalar product'`) |
+| `cross_entropy_loss` | procedure | cross-entropy + loss | Tier 2 | Cluster 2 |
+| `chinchilla_optimal_N` | procedure | Chinchilla scaling law (Hoffmann et al. 2022) + optimal model size | Tier 2 | Cluster 2 |
+| `chinchilla_optimal_D` | procedure | Chinchilla + optimal dataset size | Tier 2 | Cluster 2 |
+| `training_flops` | procedure | FLOPs = 6 * N * D (the back-of-envelope rule) | Tier 2 | Cluster 2 |
+| `kv_cache_bytes` | procedure | KV-cache memory footprint | Tier 2 | Cluster 2 |
+
+## Tier 3: Type-theoretic primitives
+
+| Term | C11 form | Etymology | Tier | Source |
+|---|---|---|---|---|
+| `TokenSequence` | `typedef struct { Token_R ids; U4 len; U4 cap; } TokenSequence` | sequence of tokens; pair of (ptr, len, cap) | Tier 3 | Cluster 3 (Pair<A, B>) |
+| `Vocabulary` | `typedef struct { U4 vocab_size; Token_R id_to_token; U4_R token_to_id; } Vocabulary` | the lookup table; bidirectional id<->token | Tier 3 | Cluster 3 (Map<name, Tensor>) |
+| `BPEMerge` | `typedef struct { Token a; Token b; Token merged; U4 count; } BPEMerge` | one BPE merge rule | Tier 3 | Cluster 3 |
+| `BPEMerger` | `typedef struct { BPEMerge_R merges; U4 len; U4 cap; U4 vocab_size; } BPEMerger` | the learned BPE merger (a sequence of merges) | Tier 3 | Cluster 3 (Sequence) |
+| `Slice` | `typedef struct { U4 ptr, len; } Slice` | (ptr, len) pair; per `c11_convention.md` §8 | Tier 3 | Cluster 3 (Pair) |
+
+## Tier 4: AI-fuzzing tolerance terms
+
+| Term | C11 form | Etymology | Tier | Source |
+|---|---|---|---|---|
+| `hidden_dim` | `U4` parameter | the embedding dimension d; bounded integer | Tier 4 | cs229 §2.6 |
+| `vocab_size` | `U4` parameter | the vocabulary size |V|; bounded integer | Tier 4 | cs229 §2.6 |
+| `compute_budget` | `F32` parameter | the FLOPs budget C; placeholder, resolved as float32 | Tier 4 | cs229 §2.4 (R1) |
+| `correlation` | (not used in this video) | `correlation : float64` (per v2 lexicon Tier 4 #4.25, R1 NEW v2) | Tier 4 | cs229 §2.6 |
+| `weakly_coupled` | (not used in this video) | `weakly_coupled(a, b, tolerance)` (per v2 lexicon §9.1) | Tier 4 | v2 lexicon §9.1 |
+
+## Etymology notes (per Cluster 7, Pattern 3)
+
+- `Token` — Old English *tacen* ("sign"); modern usage: atomic unit of text after tokenization.
+- `Logits` — Greek *λογιστικός* via Latin *logisticus*; modern ML usage: raw pre-softmax scores.
+- `Probability` — Latin *probabilitas* from *probare* ("to test"); modern usage: a value in [0, 1].
+- `Scalar` — Latin *scalaris* from *scala* ("ladder"); modern usage: a single quantity (vs. vector/matrix).
+- `Chinchilla` — Hoffmann et al. 2022, "Training Compute-Optimal Large Language Models"; the paper name.
+- `BPE` — Sennrich et al. 2016, "Neural Machine Translation of Rare Words with Subword Units"; the algorithm name.
@@ -0,0 +1,63 @@
+# cs229_building_llms — Pass 3 Notes
+
+**Track:** `video_analysis_deob_pass3_20260623`
+**Date:** 2026-06-23
+**Language:** C11 (per the per-language default in `TIER2_STARTER.md` §3)
+
+## Decisions made
+
+1. **Language:** C11 (default; per `TIER2_STARTER.md` §3 cluster A row 1).
+2. **Conventions:** duffle + forth bootslop + raddbg fallback (per `c11_convention.md`).
+3. **Header style:** design-doc header (per `c11_convention.md` §9).
+4. **Type system:** byte-width types (`U4`, `F32`) + `TSet_` / `Struct_` / `Opt_` / `Ret_` macros.
+5. **Encoding placeholders:** `float` / `integer` / `Scalar` (per v2 lexicon §7). Resolved as concrete C11 types at the function signature.
+
+## Alternatives considered
+
+1. **Jai:** could have used Jai for parametric types. Rejected because the user specified C11.
+2. **Odin:** could have used Odin for the `Vector` types. Rejected for the same reason.
+3. **Real transformer implementation:** could have implemented a tiny transformer. Rejected because the goal is to EXPRESS THE CONCEPTS, not to train a model. Per user 2026-06-23, the code "may or may not run."
+
+## Language override (none)
+
+Per `TIER2_STARTER.md` §3, the default for this video is C11. No override applied.
+
+## 4 + 3 verification criteria (per v2 lexicon §7 of `TIER2_STARTER.md`)
+
+| # | Criterion | Status | Notes |
+|---|---|---|---|
+| 1 | **Lossless** | met | All 10 concepts from the translation table are represented in the C11 code. |
+| 2 | **Bounded** | met | No `∞_val`; all values are finite (Token, F32, U4, U64). |
+| 3 | **Constructively typed** | met | Every expression has a type (`Token`, `Logits`, `Probability`, `Scalar`, etc.). |
+| 4 | **Etymology-cited** | met | Every new term has 1-line origin + 1-line history in the decoder. |
+| 5 | **Encoding-explicit** | met | Every value-bearing term has an encoding (`Token : int32`, `F32` etc.). |
+| 6 | **Form-anchored** | met | Every re-encoding has a form anchor in the translation table. |
+| 7 | **User-specific opt-in** | met | The principled form is produced; the user-specific form (e.g., Sectored Language `'scalar product'`) is opt-in. |
+
+## Hardware target (per v2 lexicon §7 of `TIER2_STARTER.md`)
+
+Per user 2026-06-23, "target up to 10k." Default workstation: Ryzen 9 / i9, RTX 4090, 128GB DDR5, 4TB NVMe.
+
+This video's concepts map to:
+- **Pretraining** (cs229 §2.3-2.4): requires RTX 4090-class GPU + 128GB DDR5 for a 7B-param model in float16. The back-of-envelope Llama 3 400B numbers (3.79e25 FLOPs) imply a much larger cluster (16k+ H100s).
+- **Inference** (cs229 §2.6): KV-cache for a 7B model is `2 * 1 * 2048 * 32 * 32 * 128 * 2 = 1.07 GB` per request (F32 here for clarity; F16 halves this to 0.54 GB). RTX 4090 has 24GB VRAM, so ~20 concurrent requests fit.
+
+## Refinements discovered (Pass 3 → lexicon v3 candidates)
+
+1. **Per-pillar encoding tags:** the lecture's six pillars (Architecture, Training, Data, Evaluation, Systems, Model) could each have a tier tag in the lexicon (currently they are Tier 2 terms mixed). The C11 code shows that the pillars map to distinct type categories; a v3 lexicon could formalize this.
+
+## Gaps identified (concepts the code couldn't capture)
+
+1. **DPO / RLHF training loop:** the lecture covers DPO (Bradley-Terry objective) at the end. Not included in this C11 program because the focus is the forward pass + loss.
+2. **Mixture of Experts (MoE) routing:** the lecture briefly touches on MoE. Not included here.
+3. **Distributed training sharding:** the lecture discusses data, tensor, and pipeline parallelism. Not included here.
+
+## See also
+
+- `cs229_building_llms.c` — the C11 program
+- `cs229_building_llms_translation.md` — the math → C11 translation table
+- `cs229_building_llms_decoder.md` — the per-term decoder (tier-categorized)
+- `conductor/tracks/video_analysis_deob_pilot_20260621/artifacts/cs229_building_llms/` — the Pass 2 input
+- `conductor/tracks/video_analysis_cs229_building_llms_20260621/report.md` — the Pass 1 source
+- `conductor/tracks/video_analysis_deob_c11_reference_20260623/c11_convention.md` — the C11 style guide
+- `conductor/tracks/video_analysis_deob_lexicon_20260621/lexicon.md` — the v2 lexicon
@@ -0,0 +1,23 @@
+# cs229_building_llms — Translation Table (math → C11)
+
+**Source:** `conductor/tracks/video_analysis_deob_pilot_20260621/artifacts/cs229_building_llms/cs229_building_llms_deobfuscated.md` (464 lines)
+**Target:** `cs229_building_llms.c` (the C11 program)
+**Method:** Per v2 lexicon Rule 2 (form-anchor) + Rule 5 (encoding-explicit)
+
+| # | Math / concept | C11 form | Form anchor | Encoding |
+|---|---|---|---|---|
+| 1 | `p : (Token^L) -> Probability` | `Logits ar_forward(TokenSequence_R context, ...)` | bounded: finite vocab + finite sequence length | `Logits : float` |
+| 2 | `p(X_t \| X_1..X_{t-1})` | prefix-arg in `ar_forward` | bounded: prefix length t | `Token : int32` |
+| 3 | `softmax(z_i) = exp(z_i) / sum_j exp(z_j)` | `softmax_at(Logits_R z, U4 i)` | bounded: subtract max for stability | `F32` (resolved) |
+| 4 | `BPE_Train : (corpus, target_vocab_size) -> Vocab` | `BPEMerger bpe_train(Slice, U4)` | bounded: target_vocab_size as a hard cap | `U4` |
+| 5 | `Tokenize : (Text) -> Seq[Token]` | `TokenSequence bpe_encode(Slice, BPEMerger_R)` | bounded: sequence length cap | `Token : int32` |
+| 6 | `L_CE = - sum_t log p_theta(X_t \| X_1..X_{t-1})` | `cross_entropy_loss(TokenSequence_R, ...)` | bounded: sum over sequence length | `Scalar : float` |
+| 7 | `N_opt(C) = a * C^0.5` | `chinchilla_optimal_N(F32 compute_budget_flops)` | bounded: compute_budget as input | `F32` |
+| 8 | `D_opt(C) = b * C^0.5` | `chinchilla_optimal_D(F32 compute_budget_flops)` | bounded: compute_budget as input | `F32` |
+| 9 | `FLOPs = 6 * N * D` | `training_flops(F32 N_params, F32 D_tokens)` | bounded: N, D as finite inputs | `F32` |
+| 10 | `Memory_KV = 2 * B * S * L * H * D * bytes_per_element` | `kv_cache_bytes(U4 batch, U4 seq_len, U4 n_layers, U4 n_heads, U4 head_dim, U4 bytes_per_elem)` | bounded: B, S, L, H, D, bytes all finite | `U64` |
+
+**Notes:**
+- The C11 program does NOT implement a real transformer; it expresses the SHAPE of the lecture's six pillars.
+- All `float` / `integer` / `Scalar` placeholders resolve to concrete C11 types (`F32`, `U4`, etc.) at the function signature.
+- Per the v2 lexicon §9.1, `<<` / `>>` operators are rendered as `much_less` / `much_greater` / `weakly_coupled` if needed; this file does not use them.