From bb2a4843ae3fbbb1e600cd57d1b27ed6223192a8 Mon Sep 17 00:00:00 2001 From: Ed_ Date: Mon, 22 Jun 2026 01:15:35 -0400 Subject: [PATCH] conductor(cs336_architectures): Phase 1 Acquire - transcript (2626 clean segments, 93KB) + 196MB mp4 --- .../artifacts/phase1.log | 11 + .../artifacts/transcript.json | 36894 ++++++++++++++++ .../artifacts/transcript_clean.txt | 2626 ++ .../artifacts/video.log | 16 + 4 files changed, 39547 insertions(+) create mode 100644 conductor/tracks/video_analysis_cs336_architectures_20260621/artifacts/phase1.log create mode 100644 conductor/tracks/video_analysis_cs336_architectures_20260621/artifacts/transcript.json create mode 100644 conductor/tracks/video_analysis_cs336_architectures_20260621/artifacts/transcript_clean.txt create mode 100644 conductor/tracks/video_analysis_cs336_architectures_20260621/artifacts/video.log diff --git a/conductor/tracks/video_analysis_cs336_architectures_20260621/artifacts/phase1.log b/conductor/tracks/video_analysis_cs336_architectures_20260621/artifacts/phase1.log new file mode 100644 index 00000000..51960ec3 --- /dev/null +++ b/conductor/tracks/video_analysis_cs336_architectures_20260621/artifacts/phase1.log @@ -0,0 +1,11 @@ +Phase 1 Acquire for cs336_architectures: https://youtu.be/lVynu4bo1rY +Artifacts: C:\projects\manual_slop\conductor\tracks\video_analysis_cs336_architectures_20260621\artifacts +Step 1: extract_transcript (yt-dlp VTT directly) + OK: wrote C:\projects\manual_slop\conductor\tracks\video_analysis_cs336_architectures_20260621\artifacts\transcript.json (5276 segments) +Step 2: download_video + OK: wrote C:\projects\manual_slop\conductor\tracks\video_analysis_cs336_architectures_20260621\artifacts\video.mp4 (205714594 bytes) +{ + "status": "ok", + "video_path": "C:\\projects\\manual_slop\\conductor\\tracks\\video_analysis_cs336_architectures_20260621\\artifacts\\video.mp4", + "transcript_path": 
"C:\\projects\\manual_slop\\conductor\\tracks\\video_analysis_cs336_architectures_20260621\\artifacts\\transcript.json" +} diff --git a/conductor/tracks/video_analysis_cs336_architectures_20260621/artifacts/transcript.json b/conductor/tracks/video_analysis_cs336_architectures_20260621/artifacts/transcript.json new file mode 100644 index 00000000..a75ca396 --- /dev/null +++ b/conductor/tracks/video_analysis_cs336_architectures_20260621/artifacts/transcript.json @@ -0,0 +1,36894 @@ +{ + "video_id": "lVynu4bo1rY", + "segments": [ + { + "start": 5.64, + "duration": 0.0, + "text": "So<00:00:05.800> today<00:00:06.040> we're<00:00:06.160> going<00:00:06.280> to<00:00:06.320> talk<00:00:06.520> about" + }, + { + "start": 6.79, + "duration": 0.0, + "text": "So today we're going to talk about" + }, + { + "start": 6.8, + "duration": 0.0, + "text": "So today we're going to talk about architecture,<00:00:07.520> which<00:00:08.200> at<00:00:08.320> least<00:00:08.640> to<00:00:08.760> me<00:00:09.240> has" + }, + { + "start": 9.43, + "duration": 0.0, + "text": "architecture, which at least to me has" + }, + { + "start": 9.44, + "duration": 0.0, + "text": "architecture, which at least to me has always<00:00:09.720> been<00:00:09.880> pretty<00:00:10.240> inscrutable." + }, + { + "start": 11.99, + "duration": 0.0, + "text": "always been pretty inscrutable." + }, + { + "start": 12.0, + "duration": 0.0, + "text": "always been pretty inscrutable. 
Um<00:00:12.080> and<00:00:12.240> so<00:00:12.360> I'm<00:00:12.440> going<00:00:12.560> to<00:00:12.640> take<00:00:12.920> the<00:00:13.040> approach" + }, + { + "start": 13.63, + "duration": 0.0, + "text": "Um and so I'm going to take the approach" + }, + { + "start": 13.64, + "duration": 0.0, + "text": "Um and so I'm going to take the approach of<00:00:13.760> just<00:00:14.080> telling<00:00:14.440> you<00:00:14.640> kind<00:00:14.800> of<00:00:15.200> everything," + }, + { + "start": 16.03, + "duration": 0.0, + "text": "of just telling you kind of everything," + }, + { + "start": 16.04, + "duration": 0.0, + "text": "of just telling you kind of everything, right?<00:00:16.280> I'm<00:00:16.320> going<00:00:16.440> to<00:00:16.520> go<00:00:16.680> through<00:00:16.960> all<00:00:17.160> of" + }, + { + "start": 17.23, + "duration": 0.0, + "text": "right? I'm going to go through all of" + }, + { + "start": 17.24, + "duration": 0.0, + "text": "right? I'm going to go through all of the<00:00:17.320> modern<00:00:17.840> papers." + }, + { + "start": 19.31, + "duration": 0.0, + "text": "the modern papers." + }, + { + "start": 19.32, + "duration": 0.0, + "text": "the modern papers. Um<00:00:19.440> and<00:00:19.560> we're<00:00:19.680> going<00:00:19.840> to<00:00:19.920> just<00:00:20.200> look<00:00:20.440> through" + }, + { + "start": 20.91, + "duration": 0.0, + "text": "Um and we're going to just look through" + }, + { + "start": 20.92, + "duration": 0.0, + "text": "Um and we're going to just look through what<00:00:21.160> has<00:00:21.320> everyone<00:00:21.720> done?<00:00:22.360> Um<00:00:22.560> and<00:00:22.680> so<00:00:22.760> I've" + }, + { + "start": 22.87, + "duration": 0.0, + "text": "what has everyone done? Um and so I've" + }, + { + "start": 22.88, + "duration": 0.0, + "text": "what has everyone done? 
Um and so I've titled<00:00:23.200> this<00:00:23.360> everything<00:00:23.720> you<00:00:23.840> didn't<00:00:24.200> want" + }, + { + "start": 24.39, + "duration": 0.0, + "text": "titled this everything you didn't want" + }, + { + "start": 24.4, + "duration": 0.0, + "text": "titled this everything you didn't want to<00:00:24.480> know<00:00:25.160> about<00:00:25.440> architectures<00:00:26.000> and" + }, + { + "start": 26.11, + "duration": 0.0, + "text": "to know about architectures and" + }, + { + "start": 26.12, + "duration": 0.0, + "text": "to know about architectures and hyperparameters<00:00:26.960> because<00:00:27.200> I<00:00:27.240> think<00:00:27.480> we<00:00:27.600> all" + }, + { + "start": 27.79, + "duration": 0.0, + "text": "hyperparameters because I think we all" + }, + { + "start": 27.8, + "duration": 0.0, + "text": "hyperparameters because I think we all wished<00:00:28.600> we<00:00:28.760> lived<00:00:28.960> in<00:00:29.080> a<00:00:29.120> world<00:00:29.520> where<00:00:29.640> the" + }, + { + "start": 29.75, + "duration": 0.0, + "text": "wished we lived in a world where the" + }, + { + "start": 29.76, + "duration": 0.0, + "text": "wished we lived in a world where the only<00:00:30.000> things<00:00:30.240> you<00:00:30.320> had<00:00:30.520> to<00:00:30.600> know<00:00:30.760> were<00:00:30.880> like<00:00:31.080> VC" + }, + { + "start": 31.35, + "duration": 0.0, + "text": "only things you had to know were like VC" + }, + { + "start": 31.36, + "duration": 0.0, + "text": "only things you had to know were like VC dimension<00:00:31.760> or<00:00:31.840> something,<00:00:32.200> right?<00:00:32.320> Like<00:00:32.439> very" + }, + { + "start": 32.71, + "duration": 0.0, + "text": "dimension or something, right? Like very" + }, + { + "start": 32.72, + "duration": 0.0, + "text": "dimension or something, right? 
Like very simple,<00:00:33.560> you<00:00:33.680> know,<00:00:33.800> theoretical<00:00:34.320> tools,<00:00:34.640> but" + }, + { + "start": 34.71, + "duration": 0.0, + "text": "simple, you know, theoretical tools, but" + }, + { + "start": 34.72, + "duration": 0.0, + "text": "simple, you know, theoretical tools, but that's<00:00:34.840> not<00:00:35.040> really<00:00:35.640> where<00:00:35.800> we<00:00:35.960> are." + }, + { + "start": 36.95, + "duration": 0.0, + "text": "that's not really where we are." + }, + { + "start": 36.96, + "duration": 0.0, + "text": "that's not really where we are. So<00:00:37.040> okay.<00:00:37.800> What<00:00:38.000> we<00:00:38.120> are<00:00:38.200> going<00:00:38.440> to<00:00:38.560> do<00:00:39.320> is<00:00:39.800> we" + }, + { + "start": 39.95, + "duration": 0.0, + "text": "So okay. What we are going to do is we" + }, + { + "start": 39.96, + "duration": 0.0, + "text": "So okay. What we are going to do is we are<00:00:40.040> going<00:00:40.280> to<00:00:40.560> try<00:00:40.760> to<00:00:40.880> understand" + }, + { + "start": 41.47, + "duration": 0.0, + "text": "are going to try to understand" + }, + { + "start": 41.48, + "duration": 0.0, + "text": "are going to try to understand architecture<00:00:42.080> from<00:00:42.280> kind<00:00:42.560> of<00:00:42.640> like<00:00:42.880> a<00:00:43.120> survey" + }, + { + "start": 43.63, + "duration": 0.0, + "text": "architecture from kind of like a survey" + }, + { + "start": 43.64, + "duration": 0.0, + "text": "architecture from kind of like a survey lens,<00:00:44.360> right?<00:00:45.000> The<00:00:45.120> best<00:00:45.400> thing<00:00:45.560> to<00:00:45.680> do,<00:00:46.280> you" + }, + { + "start": 46.35, + "duration": 0.0, + "text": "lens, right? The best thing to do, you" + }, + { + "start": 46.36, + "duration": 0.0, + "text": "lens, right? 
The best thing to do, you know,<00:00:46.480> better<00:00:46.720> than<00:00:46.880> listening<00:00:47.120> to<00:00:47.200> this" + }, + { + "start": 47.39, + "duration": 0.0, + "text": "know, better than listening to this" + }, + { + "start": 47.4, + "duration": 0.0, + "text": "know, better than listening to this lecture<00:00:47.720> even<00:00:48.000> is<00:00:48.120> for<00:00:48.280> you<00:00:48.440> to<00:00:48.520> go<00:00:48.760> out<00:00:49.040> and" + }, + { + "start": 49.15, + "duration": 0.0, + "text": "lecture even is for you to go out and" + }, + { + "start": 49.16, + "duration": 0.0, + "text": "lecture even is for you to go out and like<00:00:49.280> train<00:00:49.520> your<00:00:49.680> own<00:00:49.840> models<00:00:50.280> and<00:00:50.400> try" + }, + { + "start": 50.55, + "duration": 0.0, + "text": "like train your own models and try" + }, + { + "start": 50.56, + "duration": 0.0, + "text": "like train your own models and try different<00:00:50.880> architectures,<00:00:51.400> right?<00:00:51.520> That's" + }, + { + "start": 51.79, + "duration": 0.0, + "text": "different architectures, right? That's" + }, + { + "start": 51.8, + "duration": 0.0, + "text": "different architectures, right? That's by<00:00:51.960> far<00:00:52.200> the<00:00:52.280> best<00:00:52.480> thing<00:00:52.600> to<00:00:52.720> do.<00:00:53.480> That's<00:00:53.680> part" + }, + { + "start": 53.87, + "duration": 0.0, + "text": "by far the best thing to do. That's part" + }, + { + "start": 53.88, + "duration": 0.0, + "text": "by far the best thing to do. That's part of<00:00:53.920> the<00:00:54.000> philosophy<00:00:54.400> of<00:00:54.440> the<00:00:54.560> course.<00:00:55.520> But" + }, + { + "start": 56.23, + "duration": 0.0, + "text": "of the philosophy of the course. But" + }, + { + "start": 56.24, + "duration": 0.0, + "text": "of the philosophy of the course. 
But we're<00:00:56.360> not<00:00:56.520> going<00:00:56.640> to<00:00:56.720> be<00:00:56.920> able<00:00:57.200> to<00:00:57.320> cover<00:00:57.560> the" + }, + { + "start": 57.67, + "duration": 0.0, + "text": "we're not going to be able to cover the" + }, + { + "start": 57.68, + "duration": 0.0, + "text": "we're not going to be able to cover the whole<00:00:57.840> design<00:00:58.280> space<00:00:58.640> of<00:00:58.800> all<00:00:58.960> the<00:00:59.080> different" + }, + { + "start": 59.39, + "duration": 0.0, + "text": "whole design space of all the different" + }, + { + "start": 59.4, + "duration": 0.0, + "text": "whole design space of all the different architectures<00:01:00.080> that<00:01:00.200> are<00:01:00.480> out<00:01:00.600> there,<00:01:00.920> right?" + }, + { + "start": 61.07, + "duration": 0.0, + "text": "architectures that are out there, right?" + }, + { + "start": 61.08, + "duration": 0.0, + "text": "architectures that are out there, right? Like<00:01:01.200> that's<00:01:01.360> not<00:01:01.480> something<00:01:01.840> that<00:01:01.960> we<00:01:02.080> have" + }, + { + "start": 62.19, + "duration": 0.0, + "text": "Like that's not something that we have" + }, + { + "start": 62.2, + "duration": 0.0, + "text": "Like that's not something that we have the<00:01:02.280> compute<00:01:02.680> or<00:01:02.800> the<00:01:02.920> time<00:01:03.120> to<00:01:03.240> do." + }, + { + "start": 64.31, + "duration": 0.0, + "text": "the compute or the time to do." + }, + { + "start": 64.32, + "duration": 0.0, + "text": "the compute or the time to do. 
So<00:01:04.720> my<00:01:04.920> opinion<00:01:05.320> is<00:01:05.440> the<00:01:05.560> second<00:01:06.040> best<00:01:06.280> thing" + }, + { + "start": 66.71, + "duration": 0.0, + "text": "So my opinion is the second best thing" + }, + { + "start": 66.72, + "duration": 0.0, + "text": "So my opinion is the second best thing that<00:01:06.840> we<00:01:06.960> could<00:01:07.120> do<00:01:07.720> is<00:01:07.880> to<00:01:08.000> try<00:01:08.160> to<00:01:08.320> learn<00:01:08.520> from" + }, + { + "start": 68.67, + "duration": 0.0, + "text": "that we could do is to try to learn from" + }, + { + "start": 68.68, + "duration": 0.0, + "text": "that we could do is to try to learn from the<00:01:08.760> experience<00:01:09.240> of<00:01:09.400> others,<00:01:10.080> right?<00:01:10.400> What" + }, + { + "start": 70.51, + "duration": 0.0, + "text": "the experience of others, right? What" + }, + { + "start": 70.52, + "duration": 0.0, + "text": "the experience of others, right? What has<00:01:10.880> What<00:01:11.040> has<00:01:11.200> everyone<00:01:11.560> else<00:01:11.800> done?<00:01:12.120> What" + }, + { + "start": 72.27, + "duration": 0.0, + "text": "has What has everyone else done? What" + }, + { + "start": 72.28, + "duration": 0.0, + "text": "has What has everyone else done? What are<00:01:12.360> the<00:01:12.480> choices<00:01:12.800> that<00:01:12.960> they<00:01:13.120> are<00:01:13.240> making," + }, + { + "start": 73.63, + "duration": 0.0, + "text": "are the choices that they are making," + }, + { + "start": 73.64, + "duration": 0.0, + "text": "are the choices that they are making, right?<00:01:14.360> And<00:01:14.480> by<00:01:14.600> looking<00:01:14.880> at<00:01:14.960> kind<00:01:15.120> of<00:01:15.200> a" + }, + { + "start": 75.27, + "duration": 0.0, + "text": "right? And by looking at kind of a" + }, + { + "start": 75.28, + "duration": 0.0, + "text": "right? 
And by looking at kind of a broader,<00:01:15.880> somewhat<00:01:16.200> zoomed<00:01:16.520> out<00:01:16.680> picture," + }, + { + "start": 77.35, + "duration": 0.0, + "text": "broader, somewhat zoomed out picture," + }, + { + "start": 77.36, + "duration": 0.0, + "text": "broader, somewhat zoomed out picture, maybe<00:01:17.680> we<00:01:17.760> can<00:01:17.920> start<00:01:18.200> to<00:01:18.280> understand,<00:01:18.960> oh," + }, + { + "start": 79.07, + "duration": 0.0, + "text": "maybe we can start to understand, oh," + }, + { + "start": 79.08, + "duration": 0.0, + "text": "maybe we can start to understand, oh, these<00:01:19.400> are<00:01:19.480> the<00:01:19.640> kinds<00:01:20.000> of<00:01:20.160> parameters<00:01:20.720> and" + }, + { + "start": 80.83, + "duration": 0.0, + "text": "these are the kinds of parameters and" + }, + { + "start": 80.84, + "duration": 0.0, + "text": "these are the kinds of parameters and choices<00:01:21.720> that<00:01:21.840> are<00:01:21.920> sort<00:01:22.080> of<00:01:22.200> fixed<00:01:22.560> across" + }, + { + "start": 82.91, + "duration": 0.0, + "text": "choices that are sort of fixed across" + }, + { + "start": 82.92, + "duration": 0.0, + "text": "choices that are sort of fixed across all<00:01:23.040> effective<00:01:23.480> architectures<00:01:24.400> and<00:01:24.520> these" + }, + { + "start": 84.67, + "duration": 0.0, + "text": "all effective architectures and these" + }, + { + "start": 84.68, + "duration": 0.0, + "text": "all effective architectures and these other<00:01:24.920> ones<00:01:25.160> can<00:01:25.320> be<00:01:25.400> sort<00:01:25.520> of<00:01:25.640> varied<00:01:26.040> without" + }, + { + "start": 86.35, + "duration": 0.0, + "text": "other ones can be sort of varied without" + }, + { + "start": 86.36, + "duration": 0.0, + "text": "other ones can be sort of varied without impacting" + }, + { + "start": 87.71, + "duration": 0.0, + "text": "impacting" + }, + { + "start": 87.72, + "duration": 
0.0, + "text": "impacting how<00:01:27.880> the<00:01:28.000> model<00:01:28.280> performs,<00:01:28.800> right?<00:01:29.000> So<00:01:29.080> I'm" + }, + { + "start": 89.11, + "duration": 0.0, + "text": "how the model performs, right? So I'm" + }, + { + "start": 89.12, + "duration": 0.0, + "text": "how the model performs, right? So I'm going<00:01:29.240> to<00:01:29.320> talk<00:01:29.520> about,<00:01:30.400> you<00:01:30.480> know,<00:01:30.920> basically" + }, + { + "start": 91.19, + "duration": 0.0, + "text": "going to talk about, you know, basically" + }, + { + "start": 91.2, + "duration": 0.0, + "text": "going to talk about, you know, basically transformer<00:01:31.760> variants.<00:01:32.320> Like<00:01:32.640> what<00:01:32.880> is<00:01:33.040> the," + }, + { + "start": 93.43, + "duration": 0.0, + "text": "transformer variants. Like what is the," + }, + { + "start": 93.44, + "duration": 0.0, + "text": "transformer variants. Like what is the, you<00:01:33.520> know,<00:01:33.640> modern<00:01:34.000> transformer<00:01:34.600> starting" + }, + { + "start": 95.07, + "duration": 0.0, + "text": "you know, modern transformer starting" + }, + { + "start": 95.08, + "duration": 0.0, + "text": "you know, modern transformer starting with,<00:01:35.760> you<00:01:35.840> know,<00:01:35.960> the<00:01:36.040> Vaswani<00:01:36.520> paper<00:01:37.320> and" + }, + { + "start": 97.51, + "duration": 0.0, + "text": "with, you know, the Vaswani paper and" + }, + { + "start": 97.52, + "duration": 0.0, + "text": "with, you know, the Vaswani paper and then,<00:01:37.840> you<00:01:37.920> know,<00:01:38.360> as<00:01:38.520> we<00:01:38.680> go<00:01:38.880> to<00:01:39.040> more<00:01:39.280> modern," + }, + { + "start": 100.27, + "duration": 0.0, + "text": "then, you know, as we go to more modern," + }, + { + "start": 100.28, + "duration": 0.0, + "text": "then, you know, as we go to more modern, more<00:01:40.600> recent<00:01:41.040> architectures,<00:01:41.560> 
what<00:01:41.720> do<00:01:41.800> they" + }, + { + "start": 101.91, + "duration": 0.0, + "text": "more recent architectures, what do they" + }, + { + "start": 101.92, + "duration": 0.0, + "text": "more recent architectures, what do they have<00:01:42.120> in<00:01:42.200> common?<00:01:43.000> And<00:01:43.120> then<00:01:43.280> what<00:01:43.440> are<00:01:43.560> we" + }, + { + "start": 103.67, + "duration": 0.0, + "text": "have in common? And then what are we" + }, + { + "start": 103.68, + "duration": 0.0, + "text": "have in common? And then what are we allowed<00:01:43.960> to<00:01:44.040> vary?<00:01:44.320> Or<00:01:44.440> not<00:01:44.600> allowed,<00:01:44.960> but" + }, + { + "start": 105.11, + "duration": 0.0, + "text": "allowed to vary? Or not allowed, but" + }, + { + "start": 105.12, + "duration": 0.0, + "text": "allowed to vary? Or not allowed, but what<00:01:45.360> do<00:01:45.560> people<00:01:45.880> vary<00:01:46.640> as<00:01:46.800> they<00:01:46.920> go<00:01:47.080> through" + }, + { + "start": 107.27, + "duration": 0.0, + "text": "what do people vary as they go through" + }, + { + "start": 107.28, + "duration": 0.0, + "text": "what do people vary as they go through this,<00:01:47.680> right?" + }, + { + "start": 108.83, + "duration": 0.0, + "text": "this, right?" + }, + { + "start": 108.84, + "duration": 0.0, + "text": "this, right? 
So" + }, + { + "start": 109.83, + "duration": 0.0, + "text": "So" + }, + { + "start": 109.84, + "duration": 0.0, + "text": "So I<00:01:49.920> think<00:01:50.120> many<00:01:50.400> of<00:01:50.520> you<00:01:50.600> have<00:01:50.800> taken<00:01:51.120> an<00:01:51.240> NLP" + }, + { + "start": 111.59, + "duration": 0.0, + "text": "I think many of you have taken an NLP" + }, + { + "start": 111.6, + "duration": 0.0, + "text": "I think many of you have taken an NLP course<00:01:51.880> of<00:01:51.960> some<00:01:52.160> kind<00:01:52.680> or<00:01:52.920> at<00:01:53.000> least<00:01:53.200> seen<00:01:53.400> a" + }, + { + "start": 113.43, + "duration": 0.0, + "text": "course of some kind or at least seen a" + }, + { + "start": 113.44, + "duration": 0.0, + "text": "course of some kind or at least seen a transformer,<00:01:54.520> so<00:01:54.640> you've<00:01:54.800> probably<00:01:55.160> seen," + }, + { + "start": 115.55, + "duration": 0.0, + "text": "transformer, so you've probably seen," + }, + { + "start": 115.56, + "duration": 0.0, + "text": "transformer, so you've probably seen, you<00:01:55.640> know,<00:01:55.760> the<00:01:55.880> very<00:01:56.160> vanilla<00:01:56.720> transformer" + }, + { + "start": 117.31, + "duration": 0.0, + "text": "you know, the very vanilla transformer" + }, + { + "start": 117.32, + "duration": 0.0, + "text": "you know, the very vanilla transformer from<00:01:57.720> Vaswani<00:01:58.160> et<00:01:58.280> al.<00:01:59.040> Um<00:01:59.760> you<00:01:59.880> know,<00:02:00.440> there" + }, + { + "start": 121.55, + "duration": 0.0, + "text": "from Vaswani et al. Um you know, there" + }, + { + "start": 121.56, + "duration": 0.0, + "text": "from Vaswani et al. 
Um you know, there there<00:02:01.720> are<00:02:01.800> some<00:02:02.000> fairly<00:02:02.280> standard<00:02:02.640> choices" + }, + { + "start": 122.99, + "duration": 0.0, + "text": "there are some fairly standard choices" + }, + { + "start": 123.0, + "duration": 0.0, + "text": "there are some fairly standard choices that<00:02:03.160> you<00:02:03.240> make.<00:02:03.480> You<00:02:03.600> say,<00:02:03.800> oh,<00:02:04.080> transformers" + }, + { + "start": 124.79, + "duration": 0.0, + "text": "that you make. You say, oh, transformers" + }, + { + "start": 124.8, + "duration": 0.0, + "text": "that you make. You say, oh, transformers don't<00:02:05.040> have<00:02:05.280> positional<00:02:05.760> dependence,<00:02:06.320> so" + }, + { + "start": 126.43, + "duration": 0.0, + "text": "don't have positional dependence, so" + }, + { + "start": 126.44, + "duration": 0.0, + "text": "don't have positional dependence, so we're<00:02:06.520> going<00:02:06.640> to<00:02:06.720> add<00:02:06.800> a<00:02:06.840> position<00:02:07.240> embedding." + }, + { + "start": 128.15, + "duration": 0.0, + "text": "we're going to add a position embedding." + }, + { + "start": 128.16, + "duration": 0.0, + "text": "we're going to add a position embedding. And<00:02:08.320> what<00:02:08.440> do<00:02:08.479> we<00:02:08.600> do?<00:02:08.759> We're<00:02:08.840> going<00:02:08.960> to<00:02:09.039> add" + }, + { + "start": 129.109, + "duration": 0.0, + "text": "And what do we do? We're going to add" + }, + { + "start": 129.119, + "duration": 0.0, + "text": "And what do we do? We're going to add some<00:02:09.240> sines<00:02:09.520> and<00:02:09.640> cosines." + }, + { + "start": 131.07, + "duration": 0.0, + "text": "some sines and cosines." + }, + { + "start": 131.08, + "duration": 0.0, + "text": "some sines and cosines. 
Um<00:02:11.400> we're<00:02:11.520> going<00:02:11.640> to<00:02:11.720> have<00:02:12.200> information" + }, + { + "start": 132.75, + "duration": 0.0, + "text": "Um we're going to have information" + }, + { + "start": 132.76, + "duration": 0.0, + "text": "Um we're going to have information processing<00:02:13.360> through<00:02:13.720> a<00:02:14.000> ReLU.<00:02:15.080> Um<00:02:15.240> and<00:02:15.360> then" + }, + { + "start": 135.47, + "duration": 0.0, + "text": "processing through a ReLU. Um and then" + }, + { + "start": 135.48, + "duration": 0.0, + "text": "processing through a ReLU. Um and then we're<00:02:15.560> going<00:02:15.680> to<00:02:15.760> have<00:02:16.000> a<00:02:16.280> a<00:02:16.360> post<00:02:16.760> norm.<00:02:17.040> I'll" + }, + { + "start": 137.11, + "duration": 0.0, + "text": "we're going to have a a post norm. I'll" + }, + { + "start": 137.12, + "duration": 0.0, + "text": "we're going to have a a post norm. I'll talk<00:02:17.280> about<00:02:17.440> what<00:02:17.560> exactly<00:02:17.920> that<00:02:18.080> is<00:02:18.240> later." + }, + { + "start": 139.27, + "duration": 0.0, + "text": "talk about what exactly that is later." + }, + { + "start": 139.28, + "duration": 0.0, + "text": "talk about what exactly that is later. 
Um<00:02:19.480> and<00:02:19.600> when<00:02:19.720> you<00:02:19.800> look<00:02:19.960> at<00:02:20.040> your<00:02:20.160> assignment," + }, + { + "start": 140.87, + "duration": 0.0, + "text": "Um and when you look at your assignment," + }, + { + "start": 140.88, + "duration": 0.0, + "text": "Um and when you look at your assignment, your<00:02:21.240> A1,<00:02:22.160> you're<00:02:22.280> going<00:02:22.400> to<00:02:22.440> notice<00:02:22.680> some" + }, + { + "start": 142.83, + "duration": 0.0, + "text": "your A1, you're going to notice some" + }, + { + "start": 142.84, + "duration": 0.0, + "text": "your A1, you're going to notice some differences<00:02:23.440> between<00:02:24.160> the<00:02:24.360> standard<00:02:24.920> or<00:02:25.000> the" + }, + { + "start": 145.11, + "duration": 0.0, + "text": "differences between the standard or the" + }, + { + "start": 145.12, + "duration": 0.0, + "text": "differences between the standard or the vanilla<00:02:25.480> transformer<00:02:26.560> and<00:02:26.760> what<00:02:26.880> we've<00:02:27.040> asked" + }, + { + "start": 147.27, + "duration": 0.0, + "text": "vanilla transformer and what we've asked" + }, + { + "start": 147.28, + "duration": 0.0, + "text": "vanilla transformer and what we've asked you<00:02:27.360> to<00:02:27.480> implement.<00:02:28.000> Well,<00:02:28.200> we're<00:02:28.280> going<00:02:28.400> to" + }, + { + "start": 148.47, + "duration": 0.0, + "text": "you to implement. Well, we're going to" + }, + { + "start": 148.48, + "duration": 0.0, + "text": "you to implement. 
Well, we're going to ask<00:02:28.800> you<00:02:28.880> to<00:02:29.000> move<00:02:29.360> the<00:02:29.480> layer<00:02:29.800> norm<00:02:30.120> to<00:02:30.200> the" + }, + { + "start": 150.31, + "duration": 0.0, + "text": "ask you to move the layer norm to the" + }, + { + "start": 150.32, + "duration": 0.0, + "text": "ask you to move the layer norm to the front<00:02:30.680> of<00:02:30.800> each<00:02:30.960> transformer<00:02:31.480> block<00:02:31.880> or<00:02:32.160> the" + }, + { + "start": 152.27, + "duration": 0.0, + "text": "front of each transformer block or the" + }, + { + "start": 152.28, + "duration": 0.0, + "text": "front of each transformer block or the non-residual<00:02:32.920> layers.<00:02:33.840> We're<00:02:33.880> going<00:02:34.000> to<00:02:34.040> ask" + }, + { + "start": 154.23, + "duration": 0.0, + "text": "non-residual layers. We're going to ask" + }, + { + "start": 154.24, + "duration": 0.0, + "text": "non-residual layers. We're going to ask you<00:02:34.320> to<00:02:34.400> implement<00:02:34.720> something<00:02:34.960> called<00:02:35.200> rope." + }, + { + "start": 156.03, + "duration": 0.0, + "text": "you to implement something called rope." + }, + { + "start": 156.04, + "duration": 0.0, + "text": "you to implement something called rope. Um" + }, + { + "start": 156.63, + "duration": 0.0, + "text": "Um" + }, + { + "start": 156.64, + "duration": 0.0, + "text": "Um and<00:02:37.480> we're<00:02:37.600> going<00:02:37.720> to<00:02:37.760> ask<00:02:38.000> you<00:02:38.080> to<00:02:38.160> implement" + }, + { + "start": 158.47, + "duration": 0.0, + "text": "and we're going to ask you to implement" + }, + { + "start": 158.48, + "duration": 0.0, + "text": "and we're going to ask you to implement something<00:02:38.680> called<00:02:38.920> SwiGLU<00:02:39.480> and<00:02:39.600> not<00:02:39.840> ReLU." + }, + { + "start": 161.15, + "duration": 0.0, + "text": "something called SwiGLU and not ReLU." 
+ }, + { + "start": 161.16, + "duration": 0.0, + "text": "something called SwiGLU and not ReLU. Right?<00:02:41.680> Why<00:02:41.880> do<00:02:42.000> we<00:02:42.160> pick<00:02:42.400> these?<00:02:43.080> Um<00:02:43.320> one" + }, + { + "start": 163.59, + "duration": 0.0, + "text": "Right? Why do we pick these? Um one" + }, + { + "start": 163.6, + "duration": 0.0, + "text": "Right? Why do we pick these? Um one reason<00:02:44.040> is<00:02:44.240> we've,<00:02:44.480> you<00:02:44.560> know,<00:02:44.720> copied<00:02:45.120> a<00:02:45.160> lot" + }, + { + "start": 165.35, + "duration": 0.0, + "text": "reason is we've, you know, copied a lot" + }, + { + "start": 165.36, + "duration": 0.0, + "text": "reason is we've, you know, copied a lot of<00:02:45.440> this<00:02:45.600> over<00:02:45.840> from<00:02:46.080> LLaMA,<00:02:46.560> but<00:02:46.880> so<00:02:47.000> did" + }, + { + "start": 167.19, + "duration": 0.0, + "text": "of this over from LLaMA, but so did" + }, + { + "start": 167.2, + "duration": 0.0, + "text": "of this over from LLaMA, but so did everyone<00:02:47.480> else.<00:02:48.240> Really,<00:02:48.600> I<00:02:48.680> think<00:02:49.160> if<00:02:49.360> you" + }, + { + "start": 169.47, + "duration": 0.0, + "text": "everyone else. Really, I think if you" + }, + { + "start": 169.48, + "duration": 0.0, + "text": "everyone else. 
Really, I think if you were<00:02:49.640> to<00:02:49.840> train<00:02:50.120> on<00:02:50.200> your<00:02:50.360> own<00:02:50.520> language" + }, + { + "start": 170.87, + "duration": 0.0, + "text": "were to train on your own language" + }, + { + "start": 170.88, + "duration": 0.0, + "text": "were to train on your own language model,<00:02:51.320> I<00:02:51.400> think<00:02:51.600> you'll<00:02:51.760> quickly<00:02:52.200> run<00:02:52.360> into" + }, + { + "start": 172.51, + "duration": 0.0, + "text": "model, I think you'll quickly run into" + }, + { + "start": 172.52, + "duration": 0.0, + "text": "model, I think you'll quickly run into this<00:02:52.640> question<00:02:52.920> of,<00:02:53.080> oh,<00:02:53.160> there's<00:02:53.400> so<00:02:53.520> many" + }, + { + "start": 173.75, + "duration": 0.0, + "text": "this question of, oh, there's so many" + }, + { + "start": 173.76, + "duration": 0.0, + "text": "this question of, oh, there's so many choices,<00:02:54.440> right?<00:02:54.600> Like<00:02:54.720> what<00:02:54.840> do<00:02:54.920> I<00:02:55.040> choose" + }, + { + "start": 175.63, + "duration": 0.0, + "text": "choices, right? Like what do I choose" + }, + { + "start": 175.64, + "duration": 0.0, + "text": "choices, right? Like what do I choose for<00:02:55.760> all<00:02:55.880> these<00:02:56.080> things?" + }, + { + "start": 177.11, + "duration": 0.0, + "text": "for all these things?" + }, + { + "start": 177.12, + "duration": 0.0, + "text": "for all these things? And<00:02:57.280> so<00:02:57.640> let's<00:02:57.920> now<00:02:58.600> sort<00:02:58.800> of<00:02:58.920> walk<00:02:59.160> through" + }, + { + "start": 179.51, + "duration": 0.0, + "text": "And so let's now sort of walk through" + }, + { + "start": 179.52, + "duration": 0.0, + "text": "And so let's now sort of walk through all<00:02:59.680> these<00:02:59.840> different<00:03:00.120> models." + }, + { + "start": 181.51, + "duration": 0.0, + "text": "all these different models." 
+ }, + { + "start": 181.52, + "duration": 0.0, + "text": "all these different models. The<00:03:01.600> way<00:03:01.800> I<00:03:01.880> kind<00:03:02.080> of<00:03:02.160> think<00:03:02.320> about" + }, + { + "start": 182.51, + "duration": 0.0, + "text": "The way I kind of think about" + }, + { + "start": 182.52, + "duration": 0.0, + "text": "The way I kind of think about architectures<00:03:03.720> is<00:03:04.040> to<00:03:04.160> think<00:03:04.320> about<00:03:04.720> to<00:03:04.840> look" + }, + { + "start": 185.03, + "duration": 0.0, + "text": "architectures is to think about to look" + }, + { + "start": 185.04, + "duration": 0.0, + "text": "architectures is to think about to look at<00:03:05.200> all<00:03:05.320> the<00:03:05.440> different<00:03:05.760> things<00:03:05.960> people<00:03:06.200> have" + }, + { + "start": 186.31, + "duration": 0.0, + "text": "at all the different things people have" + }, + { + "start": 186.32, + "duration": 0.0, + "text": "at all the different things people have done<00:03:06.520> and<00:03:06.680> say,<00:03:07.400> what<00:03:07.560> are<00:03:07.680> the<00:03:07.800> things<00:03:08.040> that" + }, + { + "start": 188.15, + "duration": 0.0, + "text": "done and say, what are the things that" + }, + { + "start": 188.16, + "duration": 0.0, + "text": "done and say, what are the things that people<00:03:08.440> have<00:03:08.600> done?<00:03:09.040> Can<00:03:09.200> we<00:03:09.360> pick<00:03:09.560> and<00:03:09.680> choose" + }, + { + "start": 189.91, + "duration": 0.0, + "text": "people have done? Can we pick and choose" + }, + { + "start": 189.92, + "duration": 0.0, + "text": "people have done? Can we pick and choose from<00:03:10.080> those?" + }, + { + "start": 191.15, + "duration": 0.0, + "text": "from those?" + }, + { + "start": 191.16, + "duration": 0.0, + "text": "from those? 
Um<00:03:11.760> Percy<00:03:12.080> always<00:03:12.320> makes<00:03:12.560> fun<00:03:12.720> of<00:03:12.800> me<00:03:12.960> for<00:03:13.080> this" + }, + { + "start": 193.23, + "duration": 0.0, + "text": "Um Percy always makes fun of me for this" + }, + { + "start": 193.24, + "duration": 0.0, + "text": "Um Percy always makes fun of me for this a<00:03:13.320> little<00:03:13.560> bit,<00:03:13.880> but<00:03:14.080> you<00:03:14.160> know,<00:03:14.320> I<00:03:14.400> try<00:03:14.600> to" + }, + { + "start": 194.87, + "duration": 0.0, + "text": "a little bit, but you know, I try to" + }, + { + "start": 194.88, + "duration": 0.0, + "text": "a little bit, but you know, I try to look<00:03:15.080> at<00:03:15.200> all<00:03:15.320> the<00:03:15.760> the<00:03:15.840> different<00:03:16.120> models" + }, + { + "start": 196.43, + "duration": 0.0, + "text": "look at all the the different models" + }, + { + "start": 196.44, + "duration": 0.0, + "text": "look at all the the different models that<00:03:16.600> come<00:03:16.760> out<00:03:16.960> each<00:03:17.120> year<00:03:17.440> to<00:03:17.560> try<00:03:17.760> to<00:03:18.080> make" + }, + { + "start": 198.31, + "duration": 0.0, + "text": "that come out each year to try to make" + }, + { + "start": 198.32, + "duration": 0.0, + "text": "that come out each year to try to make this<00:03:18.440> lecture." + }, + { + "start": 199.55, + "duration": 0.0, + "text": "this lecture." + }, + { + "start": 199.56, + "duration": 0.0, + "text": "this lecture. 
Um<00:03:19.880> and<00:03:20.040> last<00:03:20.280> year<00:03:20.400> I<00:03:20.440> thought,<00:03:20.680> oh,<00:03:20.800> there's" + }, + { + "start": 200.99, + "duration": 0.0, + "text": "Um and last year I thought, oh, there's" + }, + { + "start": 201.0, + "duration": 0.0, + "text": "Um and last year I thought, oh, there's just<00:03:21.160> a<00:03:21.200> couple<00:03:21.560> papers.<00:03:21.960> It's<00:03:22.080> going<00:03:22.200> to<00:03:22.280> be" + }, + { + "start": 202.39, + "duration": 0.0, + "text": "just a couple papers. It's going to be" + }, + { + "start": 202.4, + "duration": 0.0, + "text": "just a couple papers. It's going to be fine.<00:03:22.760> It's<00:03:22.840> going<00:03:22.960> to<00:03:23.040> be<00:03:23.160> fine.<00:03:23.480> And<00:03:23.560> then<00:03:23.680> I" + }, + { + "start": 203.75, + "duration": 0.0, + "text": "fine. It's going to be fine. And then I" + }, + { + "start": 203.76, + "duration": 0.0, + "text": "fine. It's going to be fine. And then I look<00:03:23.960> through<00:03:24.120> all<00:03:24.200> the<00:03:24.280> things<00:03:24.600> and<00:03:24.680> there's" + }, + { + "start": 204.87, + "duration": 0.0, + "text": "look through all the things and there's" + }, + { + "start": 204.88, + "duration": 0.0, + "text": "look through all the things and there's a<00:03:24.920> lot<00:03:25.240> of<00:03:25.320> papers.<00:03:25.680> There's<00:03:25.880> Qwen<00:03:26.160> 2<00:03:26.440> and" + }, + { + "start": 206.55, + "duration": 0.0, + "text": "a lot of papers. There's Qwen 2 and" + }, + { + "start": 206.56, + "duration": 0.0, + "text": "a lot of papers. There's Qwen 2 and Gemma<00:03:26.800> 3" + }, + { + "start": 207.87, + "duration": 0.0, + "text": "Gemma 3" + }, + { + "start": 207.88, + "duration": 0.0, + "text": "Gemma 3 and<00:03:28.000> InternLM2." + }, + { + "start": 209.55, + "duration": 0.0, + "text": "and InternLM2." + }, + { + "start": 209.56, + "duration": 0.0, + "text": "and InternLM2. 
Um" + }, + { + "start": 210.19, + "duration": 0.0, + "text": "Um" + }, + { + "start": 210.2, + "duration": 0.0, + "text": "Um and<00:03:30.320> then<00:03:30.440> there<00:03:30.560> were<00:03:30.640> even<00:03:30.880> more.<00:03:31.080> There's" + }, + { + "start": 211.19, + "duration": 0.0, + "text": "and then there were even more. There's" + }, + { + "start": 211.2, + "duration": 0.0, + "text": "and then there were even more. There's like<00:03:31.320> NeMo<00:03:31.560> Tron<00:03:31.800> 4<00:03:32.120> and<00:03:32.240> Qwen<00:03:32.480> 2<00:03:32.600> and<00:03:32.720> oh<00:03:32.960> oh<00:03:33.080> my" + }, + { + "start": 213.23, + "duration": 0.0, + "text": "like NeMo Tron 4 and Qwen 2 and oh oh my" + }, + { + "start": 213.24, + "duration": 0.0, + "text": "like NeMo Tron 4 and Qwen 2 and oh oh my goodness,<00:03:33.600> there<00:03:33.760> were<00:03:34.240> 19<00:03:34.800> new<00:03:34.960> dense" + }, + { + "start": 215.23, + "duration": 0.0, + "text": "goodness, there were 19 new dense" + }, + { + "start": 215.24, + "duration": 0.0, + "text": "goodness, there were 19 new dense models.<00:03:35.680> And<00:03:35.800> so<00:03:35.960> last<00:03:36.320> year<00:03:36.680> I<00:03:36.760> had<00:03:36.920> my<00:03:37.080> work" + }, + { + "start": 217.23, + "duration": 0.0, + "text": "models. And so last year I had my work" + }, + { + "start": 217.24, + "duration": 0.0, + "text": "models. And so last year I had my work cut<00:03:37.440> out<00:03:37.560> for<00:03:37.680> me." + }, + { + "start": 218.87, + "duration": 0.0, + "text": "cut out for me." + }, + { + "start": 218.88, + "duration": 0.0, + "text": "cut out for me. 
And<00:03:38.960> then<00:03:39.040> this<00:03:39.240> year,<00:03:40.040> you<00:03:40.120> know,<00:03:40.240> I<00:03:40.320> thought," + }, + { + "start": 220.59, + "duration": 0.0, + "text": "And then this year, you know, I thought," + }, + { + "start": 220.6, + "duration": 0.0, + "text": "And then this year, you know, I thought, well,<00:03:40.920> there<00:03:41.080> can't<00:03:41.280> be<00:03:41.400> that<00:03:41.680> many<00:03:41.920> new<00:03:42.120> LM" + }, + { + "start": 222.43, + "duration": 0.0, + "text": "well, there can't be that many new LM" + }, + { + "start": 222.44, + "duration": 0.0, + "text": "well, there can't be that many new LM releases.<00:03:42.960> Like<00:03:43.080> it's<00:03:43.200> got<00:03:43.400> to<00:03:43.440> be<00:03:43.560> slowing" + }, + { + "start": 223.99, + "duration": 0.0, + "text": "releases. Like it's got to be slowing" + }, + { + "start": 224.0, + "duration": 0.0, + "text": "releases. Like it's got to be slowing down,<00:03:44.400> right?<00:03:44.640> Like<00:03:44.760> people<00:03:45.000> can't<00:03:45.360> keep" + }, + { + "start": 225.51, + "duration": 0.0, + "text": "down, right? Like people can't keep" + }, + { + "start": 225.52, + "duration": 0.0, + "text": "down, right? Like people can't keep training<00:03:45.880> 20<00:03:46.200> dense<00:03:46.520> LMs<00:03:46.880> per<00:03:47.040> year.<00:03:47.760> Um<00:03:47.880> and" + }, + { + "start": 227.95, + "duration": 0.0, + "text": "training 20 dense LMs per year. Um and" + }, + { + "start": 227.96, + "duration": 0.0, + "text": "training 20 dense LMs per year. Um and that's<00:03:48.120> technically<00:03:48.560> right.<00:03:48.800> There<00:03:48.880> aren't" + }, + { + "start": 228.99, + "duration": 0.0, + "text": "that's technically right. There aren't" + }, + { + "start": 229.0, + "duration": 0.0, + "text": "that's technically right. 
There aren't that<00:03:49.120> as<00:03:49.240> many<00:03:49.560> dense<00:03:49.840> LMs.<00:03:50.240> Initially,<00:03:50.720> you" + }, + { + "start": 230.79, + "duration": 0.0, + "text": "that as many dense LMs. Initially, you" + }, + { + "start": 230.8, + "duration": 0.0, + "text": "that as many dense LMs. Initially, you know,<00:03:50.880> I<00:03:50.920> was<00:03:51.040> like,<00:03:51.200> oh,<00:03:51.280> there's<00:03:51.520> Qwen<00:03:51.800> 3," + }, + { + "start": 232.59, + "duration": 0.0, + "text": "know, I was like, oh, there's Qwen 3," + }, + { + "start": 232.6, + "duration": 0.0, + "text": "know, I was like, oh, there's Qwen 3, Gemma<00:03:52.840> 4<00:03:53.080> just<00:03:53.280> came<00:03:53.480> out<00:03:53.640> last<00:03:53.880> Thursday,<00:03:54.240> so" + }, + { + "start": 234.35, + "duration": 0.0, + "text": "Gemma 4 just came out last Thursday, so" + }, + { + "start": 234.36, + "duration": 0.0, + "text": "Gemma 4 just came out last Thursday, so I<00:03:54.400> put<00:03:54.600> that<00:03:54.760> in<00:03:54.880> there.<00:03:55.440> And<00:03:55.560> almost<00:03:55.880> 3.<00:03:56.080> You" + }, + { + "start": 236.15, + "duration": 0.0, + "text": "I put that in there. And almost 3. You" + }, + { + "start": 236.16, + "duration": 0.0, + "text": "I put that in there. And almost 3. You know,<00:03:56.240> there's<00:03:56.440> only<00:03:56.640> a<00:03:56.680> couple.<00:03:57.480> And<00:03:57.560> of" + }, + { + "start": 237.63, + "duration": 0.0, + "text": "know, there's only a couple. And of" + }, + { + "start": 237.64, + "duration": 0.0, + "text": "know, there's only a couple. 
And of course<00:03:57.840> I<00:03:57.880> have<00:03:58.000> to<00:03:58.560> give<00:03:58.720> a<00:03:58.760> shout-out<00:03:59.160> to" + }, + { + "start": 239.27, + "duration": 0.0, + "text": "course I have to give a shout-out to" + }, + { + "start": 239.28, + "duration": 0.0, + "text": "course I have to give a shout-out to Percy's<00:03:59.720> own<00:03:59.920> 8B<00:04:00.280> model<00:04:00.640> trained<00:04:01.320> with" + }, + { + "start": 241.39, + "duration": 0.0, + "text": "Percy's own 8B model trained with" + }, + { + "start": 241.4, + "duration": 0.0, + "text": "Percy's own 8B model trained with Marine.<00:04:01.640> And<00:04:01.720> I<00:04:01.760> was<00:04:01.840> like,<00:04:02.000> oh,<00:04:02.160> we'll<00:04:02.320> just" + }, + { + "start": 242.51, + "duration": 0.0, + "text": "Marine. And I was like, oh, we'll just" + }, + { + "start": 242.52, + "duration": 0.0, + "text": "Marine. And I was like, oh, we'll just have<00:04:02.640> a<00:04:02.720> few<00:04:02.960> things<00:04:03.240> to<00:04:03.320> cover.<00:04:04.040> Um<00:04:04.160> but<00:04:04.280> it" + }, + { + "start": 244.39, + "duration": 0.0, + "text": "have a few things to cover. Um but it" + }, + { + "start": 244.4, + "duration": 0.0, + "text": "have a few things to cover. Um but it turns<00:04:04.680> out<00:04:04.760> if<00:04:04.840> you<00:04:04.920> start<00:04:05.160> looking,<00:04:05.480> there's" + }, + { + "start": 245.59, + "duration": 0.0, + "text": "turns out if you start looking, there's" + }, + { + "start": 245.6, + "duration": 0.0, + "text": "turns out if you start looking, there's a<00:04:05.840> lot<00:04:06.080> of<00:04:06.160> different<00:04:06.480> models.<00:04:07.440> Um<00:04:07.880> and<00:04:08.000> so<00:04:08.480> the" + }, + { + "start": 248.59, + "duration": 0.0, + "text": "a lot of different models. Um and so the" + }, + { + "start": 248.6, + "duration": 0.0, + "text": "a lot of different models. 
Um and so the fact<00:04:08.920> that<00:04:09.040> we<00:04:09.160> have<00:04:09.800> so<00:04:10.040> many<00:04:10.200> different" + }, + { + "start": 250.51, + "duration": 0.0, + "text": "fact that we have so many different" + }, + { + "start": 250.52, + "duration": 0.0, + "text": "fact that we have so many different models,<00:04:11.000> most<00:04:11.360> of<00:04:11.440> these<00:04:11.640> actually<00:04:12.040> are<00:04:12.160> MoEs," + }, + { + "start": 252.71, + "duration": 0.0, + "text": "models, most of these actually are MoEs," + }, + { + "start": 252.72, + "duration": 0.0, + "text": "models, most of these actually are MoEs, mixtures<00:04:13.120> of<00:04:13.200> experts,<00:04:13.680> and<00:04:13.800> I'll<00:04:13.880> be<00:04:13.960> talking" + }, + { + "start": 254.27, + "duration": 0.0, + "text": "mixtures of experts, and I'll be talking" + }, + { + "start": 254.28, + "duration": 0.0, + "text": "mixtures of experts, and I'll be talking about<00:04:14.520> that<00:04:14.720> tomorrow<00:04:15.080> rather<00:04:15.320> than<00:04:15.400> today." + }, + { + "start": 256.15, + "duration": 0.0, + "text": "about that tomorrow rather than today." + }, + { + "start": 256.16, + "duration": 0.0, + "text": "about that tomorrow rather than today. 
Um<00:04:16.280> because<00:04:16.560> we<00:04:16.640> have<00:04:16.760> such<00:04:17.040> a<00:04:17.120> big<00:04:17.359> diversity" + }, + { + "start": 257.87, + "duration": 0.0, + "text": "Um because we have such a big diversity" + }, + { + "start": 257.88, + "duration": 0.0, + "text": "Um because we have such a big diversity of<00:04:18.000> models,<00:04:18.359> we<00:04:18.480> actually<00:04:18.799> get<00:04:19.400> a<00:04:19.519> pretty<00:04:20.040> good" + }, + { + "start": 260.27, + "duration": 0.0, + "text": "of models, we actually get a pretty good" + }, + { + "start": 260.28, + "duration": 0.0, + "text": "of models, we actually get a pretty good picture<00:04:21.239> of<00:04:21.600> all<00:04:21.799> the<00:04:21.920> different<00:04:22.280> choices" + }, + { + "start": 262.71, + "duration": 0.0, + "text": "picture of all the different choices" + }, + { + "start": 262.72, + "duration": 0.0, + "text": "picture of all the different choices that<00:04:22.840> we<00:04:23.000> can<00:04:23.160> make.<00:04:23.560> Um" + }, + { + "start": 264.11, + "duration": 0.0, + "text": "that we can make. Um" + }, + { + "start": 264.12, + "duration": 0.0, + "text": "that we can make. Um so<00:04:24.240> I<00:04:24.280> I<00:04:24.480> made<00:04:24.760> this<00:04:24.920> like<00:04:25.080> little<00:04:25.320> table." + }, + { + "start": 265.67, + "duration": 0.0, + "text": "so I I made this like little table." + }, + { + "start": 265.68, + "duration": 0.0, + "text": "so I I made this like little table. We'll<00:04:25.800> come<00:04:26.000> back<00:04:26.320> to<00:04:26.400> this<00:04:26.560> little<00:04:26.800> table<00:04:27.080> at" + }, + { + "start": 267.15, + "duration": 0.0, + "text": "We'll come back to this little table at" + }, + { + "start": 267.16, + "duration": 0.0, + "text": "We'll come back to this little table at the<00:04:27.320> end<00:04:27.520> of<00:04:27.640> the<00:04:27.720> lecture." 
+ }, + { + "start": 268.79, + "duration": 0.0, + "text": "the end of the lecture." + }, + { + "start": 268.8, + "duration": 0.0, + "text": "the end of the lecture. Um<00:04:29.480> but<00:04:29.640> basically<00:04:30.160> at<00:04:30.280> this<00:04:30.480> point,<00:04:30.720> you" + }, + { + "start": 270.75, + "duration": 0.0, + "text": "Um but basically at this point, you" + }, + { + "start": 270.76, + "duration": 0.0, + "text": "Um but basically at this point, you know,<00:04:30.840> starting<00:04:31.240> with,<00:04:31.840> you<00:04:31.920> know,<00:04:32.040> the" + }, + { + "start": 272.15, + "duration": 0.0, + "text": "know, starting with, you know, the" + }, + { + "start": 272.16, + "duration": 0.0, + "text": "know, starting with, you know, the original<00:04:32.640> transformer,<00:04:33.480> there's<00:04:33.720> been" + }, + { + "start": 273.83, + "duration": 0.0, + "text": "original transformer, there's been" + }, + { + "start": 273.84, + "duration": 0.0, + "text": "original transformer, there's been actually<00:04:34.200> quite<00:04:34.840> a<00:04:34.920> few<00:04:35.600> autoregressive" + }, + { + "start": 276.31, + "duration": 0.0, + "text": "actually quite a few autoregressive" + }, + { + "start": 276.32, + "duration": 0.0, + "text": "actually quite a few autoregressive language<00:04:36.720> models<00:04:37.320> kind<00:04:37.520> of<00:04:37.600> trained<00:04:38.040> on<00:04:38.120> the" + }, + { + "start": 278.23, + "duration": 0.0, + "text": "language models kind of trained on the" + }, + { + "start": 278.24, + "duration": 0.0, + "text": "language models kind of trained on the same<00:04:38.520> class<00:04:38.880> of<00:04:39.000> things." + }, + { + "start": 280.15, + "duration": 0.0, + "text": "same class of things." + }, + { + "start": 280.16, + "duration": 0.0, + "text": "same class of things. 
Um<00:04:40.480> and<00:04:40.600> you<00:04:40.680> can<00:04:40.840> ask<00:04:41.160> questions<00:04:41.600> like,<00:04:41.960> what" + }, + { + "start": 282.19, + "duration": 0.0, + "text": "Um and you can ask questions like, what" + }, + { + "start": 282.2, + "duration": 0.0, + "text": "Um and you can ask questions like, what are<00:04:42.280> the<00:04:42.400> different<00:04:42.760> vocabulary<00:04:43.480> sizes?<00:04:44.000> Or" + }, + { + "start": 284.15, + "duration": 0.0, + "text": "are the different vocabulary sizes? Or" + }, + { + "start": 284.16, + "duration": 0.0, + "text": "are the different vocabulary sizes? Or what<00:04:44.360> kind<00:04:44.640> of<00:04:44.800> layer<00:04:45.080> norms<00:04:45.440> do<00:04:45.560> we<00:04:45.760> use?<00:04:46.440> Or," + }, + { + "start": 287.03, + "duration": 0.0, + "text": "what kind of layer norms do we use? Or," + }, + { + "start": 287.04, + "duration": 0.0, + "text": "what kind of layer norms do we use? Or, you<00:04:47.160> know,<00:04:47.280> what<00:04:47.440> kind<00:04:47.640> of<00:04:47.720> position" + }, + { + "start": 288.11, + "duration": 0.0, + "text": "you know, what kind of position" + }, + { + "start": 288.12, + "duration": 0.0, + "text": "you know, what kind of position embeddings<00:04:48.560> do<00:04:48.680> people<00:04:49.000> use?<00:04:49.520> And<00:04:49.640> we<00:04:49.720> see" + }, + { + "start": 289.87, + "duration": 0.0, + "text": "embeddings do people use? And we see" + }, + { + "start": 289.88, + "duration": 0.0, + "text": "embeddings do people use? And we see some<00:04:50.040> fairly<00:04:50.360> clear<00:04:50.600> trends.<00:04:50.920> I'll<00:04:51.000> be" + }, + { + "start": 291.11, + "duration": 0.0, + "text": "some fairly clear trends. I'll be" + }, + { + "start": 291.12, + "duration": 0.0, + "text": "some fairly clear trends. I'll be talking<00:04:51.440> about<00:04:51.680> this<00:04:51.840> as<00:04:51.960> we<00:04:52.080> go." 
+ }, + { + "start": 293.27, + "duration": 0.0, + "text": "talking about this as we go." + }, + { + "start": 293.28, + "duration": 0.0, + "text": "talking about this as we go. Okay." + }, + { + "start": 294.31, + "duration": 0.0, + "text": "Okay." + }, + { + "start": 294.32, + "duration": 0.0, + "text": "Okay. Um<00:04:54.800> so" + }, + { + "start": 296.87, + "duration": 0.0, + "text": "Um so" + }, + { + "start": 296.88, + "duration": 0.0, + "text": "Um so the<00:04:57.040> goal<00:04:57.320> here<00:04:57.880> is<00:04:58.040> that<00:04:58.160> we're<00:04:58.240> going<00:04:58.360> to" + }, + { + "start": 298.39, + "duration": 0.0, + "text": "the goal here is that we're going to" + }, + { + "start": 298.4, + "duration": 0.0, + "text": "the goal here is that we're going to cover<00:04:59.280> couple<00:04:59.560> different<00:04:59.880> things.<00:05:00.160> We're" + }, + { + "start": 300.27, + "duration": 0.0, + "text": "cover couple different things. We're" + }, + { + "start": 300.28, + "duration": 0.0, + "text": "cover couple different things. We're going<00:05:00.400> to<00:05:00.440> cover<00:05:01.080> common<00:05:01.440> architecture" + }, + { + "start": 301.87, + "duration": 0.0, + "text": "going to cover common architecture" + }, + { + "start": 301.88, + "duration": 0.0, + "text": "going to cover common architecture variations.<00:05:02.720> So<00:05:02.840> these<00:05:03.000> are<00:05:03.080> different" + }, + { + "start": 303.39, + "duration": 0.0, + "text": "variations. So these are different" + }, + { + "start": 303.4, + "duration": 0.0, + "text": "variations. So these are different building<00:05:03.800> blocks<00:05:04.160> of<00:05:04.280> the<00:05:04.360> transformer." + }, + { + "start": 305.67, + "duration": 0.0, + "text": "building blocks of the transformer." + }, + { + "start": 305.68, + "duration": 0.0, + "text": "building blocks of the transformer. 
Um<00:05:06.000> and<00:05:06.120> after<00:05:06.360> we've<00:05:06.520> established<00:05:07.160> what<00:05:07.280> the" + }, + { + "start": 307.35, + "duration": 0.0, + "text": "Um and after we've established what the" + }, + { + "start": 307.36, + "duration": 0.0, + "text": "Um and after we've established what the standard<00:05:07.800> building<00:05:08.120> blocks<00:05:08.520> are,<00:05:09.040> like,<00:05:09.200> you" + }, + { + "start": 309.27, + "duration": 0.0, + "text": "standard building blocks are, like, you" + }, + { + "start": 309.28, + "duration": 0.0, + "text": "standard building blocks are, like, you know,<00:05:09.400> what<00:05:09.600> do<00:05:09.680> we<00:05:09.840> use<00:05:10.240> for<00:05:10.480> the<00:05:10.720> the" + }, + { + "start": 310.83, + "duration": 0.0, + "text": "know, what do we use for the the" + }, + { + "start": 310.84, + "duration": 0.0, + "text": "know, what do we use for the the nonlinearities<00:05:11.840> or<00:05:11.920> what<00:05:12.040> do<00:05:12.120> we<00:05:12.200> use<00:05:12.320> for" + }, + { + "start": 312.43, + "duration": 0.0, + "text": "nonlinearities or what do we use for" + }, + { + "start": 312.44, + "duration": 0.0, + "text": "nonlinearities or what do we use for position<00:05:12.840> embeddings,<00:05:13.880> then<00:05:14.080> we're<00:05:14.200> going<00:05:14.320> to" + }, + { + "start": 314.39, + "duration": 0.0, + "text": "position embeddings, then we're going to" + }, + { + "start": 314.4, + "duration": 0.0, + "text": "position embeddings, then we're going to talk<00:05:14.600> about<00:05:14.920> hyperparameters.<00:05:15.760> We're<00:05:15.840> going" + }, + { + "start": 315.95, + "duration": 0.0, + "text": "talk about hyperparameters. We're going" + }, + { + "start": 315.96, + "duration": 0.0, + "text": "talk about hyperparameters. 
We're going to<00:05:16.000> go<00:05:16.120> down<00:05:16.360> even<00:05:16.640> lower<00:05:16.920> detail<00:05:17.320> and<00:05:17.400> say" + }, + { + "start": 317.55, + "duration": 0.0, + "text": "to go down even lower detail and say" + }, + { + "start": 317.56, + "duration": 0.0, + "text": "to go down even lower detail and say like,<00:05:18.200> you<00:05:18.320> know,<00:05:18.520> what<00:05:18.720> is<00:05:18.840> FF<00:05:19.120> dim?<00:05:19.880> Um" + }, + { + "start": 320.31, + "duration": 0.0, + "text": "like, you know, what is FF dim? Um" + }, + { + "start": 320.32, + "duration": 0.0, + "text": "like, you know, what is FF dim? Um should<00:05:20.440> we<00:05:20.560> make<00:05:20.760> that<00:05:20.920> a<00:05:21.000> multiple<00:05:21.440> of<00:05:21.560> four" + }, + { + "start": 322.03, + "duration": 0.0, + "text": "should we make that a multiple of four" + }, + { + "start": 322.04, + "duration": 0.0, + "text": "should we make that a multiple of four or<00:05:22.120> like<00:05:22.360> multiply<00:05:22.920> the<00:05:23.200> the<00:05:23.320> hidden<00:05:23.640> by<00:05:23.840> four" + }, + { + "start": 324.11, + "duration": 0.0, + "text": "or like multiply the the hidden by four" + }, + { + "start": 324.12, + "duration": 0.0, + "text": "or like multiply the the hidden by four to<00:05:24.200> get<00:05:24.400> FF<00:05:24.640> dim?<00:05:25.360> How<00:05:25.520> many<00:05:25.720> vocab<00:05:26.120> elements" + }, + { + "start": 326.47, + "duration": 0.0, + "text": "to get FF dim? How many vocab elements" + }, + { + "start": 326.48, + "duration": 0.0, + "text": "to get FF dim? How many vocab elements should<00:05:26.640> I<00:05:26.720> have?<00:05:27.600> Um" + }, + { + "start": 328.19, + "duration": 0.0, + "text": "should I have? Um" + }, + { + "start": 328.2, + "duration": 0.0, + "text": "should I have? 
Um and<00:05:28.400> then<00:05:28.720> after<00:05:29.000> that,<00:05:29.280> we're<00:05:29.360> going<00:05:29.480> to<00:05:29.560> talk" + }, + { + "start": 329.75, + "duration": 0.0, + "text": "and then after that, we're going to talk" + }, + { + "start": 329.76, + "duration": 0.0, + "text": "and then after that, we're going to talk about<00:05:30.440> very<00:05:30.800> low-level<00:05:31.280> tricks<00:05:32.080> of<00:05:32.440> how<00:05:32.720> to" + }, + { + "start": 333.03, + "duration": 0.0, + "text": "about very low-level tricks of how to" + }, + { + "start": 333.04, + "duration": 0.0, + "text": "about very low-level tricks of how to get<00:05:33.280> models<00:05:33.680> to<00:05:33.800> train<00:05:34.120> stably.<00:05:34.720> And<00:05:34.840> the" + }, + { + "start": 334.91, + "duration": 0.0, + "text": "get models to train stably. And the" + }, + { + "start": 334.92, + "duration": 0.0, + "text": "get models to train stably. And the reason<00:05:35.240> why<00:05:35.360> I'm<00:05:35.400> going<00:05:35.520> to<00:05:35.600> talk<00:05:35.800> about<00:05:36.040> that" + }, + { + "start": 336.23, + "duration": 0.0, + "text": "reason why I'm going to talk about that" + }, + { + "start": 336.24, + "duration": 0.0, + "text": "reason why I'm going to talk about that in<00:05:36.360> this<00:05:36.600> lecture<00:05:37.360> is<00:05:37.480> because<00:05:37.760> these" + }, + { + "start": 337.99, + "duration": 0.0, + "text": "in this lecture is because these" + }, + { + "start": 338.0, + "duration": 0.0, + "text": "in this lecture is because these stability<00:05:38.520> tricks<00:05:39.200> have<00:05:39.400> a<00:05:39.480> pretty<00:05:40.000> close" + }, + { + "start": 340.39, + "duration": 0.0, + "text": "stability tricks have a pretty close" + }, + { + "start": 340.4, + "duration": 0.0, + "text": "stability tricks have a pretty close connection<00:05:40.880> with<00:05:41.040> the<00:05:41.240> architecture" + }, + { + "start": 341.79, + 
"duration": 0.0, + "text": "connection with the architecture" + }, + { + "start": 341.8, + "duration": 0.0, + "text": "connection with the architecture variation,<00:05:42.720> right?<00:05:43.480> Um<00:05:43.760> one<00:05:43.920> of<00:05:44.000> the<00:05:44.080> things" + }, + { + "start": 344.31, + "duration": 0.0, + "text": "variation, right? Um one of the things" + }, + { + "start": 344.32, + "duration": 0.0, + "text": "variation, right? Um one of the things that,<00:05:44.640> you<00:05:44.720> know,<00:05:44.840> higher<00:05:45.120> level<00:05:45.440> I<00:05:45.480> want<00:05:45.680> to" + }, + { + "start": 345.75, + "duration": 0.0, + "text": "that, you know, higher level I want to" + }, + { + "start": 345.76, + "duration": 0.0, + "text": "that, you know, higher level I want to sort<00:05:45.880> of<00:05:45.960> impress<00:05:46.320> upon<00:05:46.680> you<00:05:47.200> is<00:05:47.360> that" + }, + { + "start": 347.51, + "duration": 0.0, + "text": "sort of impress upon you is that" + }, + { + "start": 347.52, + "duration": 0.0, + "text": "sort of impress upon you is that architectures<00:05:48.200> are<00:05:48.320> actually<00:05:48.840> a<00:05:49.160> a<00:05:49.200> very" + }, + { + "start": 349.67, + "duration": 0.0, + "text": "architectures are actually a a very" + }, + { + "start": 349.68, + "duration": 0.0, + "text": "architectures are actually a a very complex<00:05:50.320> set<00:05:50.480> of<00:05:50.600> tradeoffs,<00:05:51.280> right?<00:05:51.640> Like" + }, + { + "start": 351.79, + "duration": 0.0, + "text": "complex set of tradeoffs, right? Like" + }, + { + "start": 351.8, + "duration": 0.0, + "text": "complex set of tradeoffs, right? Like what<00:05:51.920> does<00:05:52.040> a<00:05:52.120> architecture<00:05:52.680> have<00:05:52.880> to<00:05:53.040> do?" + }, + { + "start": 353.83, + "duration": 0.0, + "text": "what does a architecture have to do?" 
+ }, + { + "start": 353.84, + "duration": 0.0, + "text": "what does a architecture have to do? Well,<00:05:54.200> it<00:05:54.320> has<00:05:54.520> to<00:05:54.640> learn<00:05:54.880> from<00:05:55.080> data,<00:05:55.320> so<00:05:55.400> it" + }, + { + "start": 355.43, + "duration": 0.0, + "text": "Well, it has to learn from data, so it" + }, + { + "start": 355.44, + "duration": 0.0, + "text": "Well, it has to learn from data, so it has<00:05:55.520> to<00:05:55.640> generalize.<00:05:56.600> It<00:05:56.680> has<00:05:56.840> to<00:05:56.920> train" + }, + { + "start": 357.27, + "duration": 0.0, + "text": "has to generalize. It has to train" + }, + { + "start": 357.28, + "duration": 0.0, + "text": "has to generalize. It has to train efficiently<00:05:57.760> on<00:05:57.880> GPUs.<00:05:58.920> And<00:05:59.400> it<00:05:59.480> has<00:05:59.680> to<00:05:59.760> not" + }, + { + "start": 360.27, + "duration": 0.0, + "text": "efficiently on GPUs. And it has to not" + }, + { + "start": 360.28, + "duration": 0.0, + "text": "efficiently on GPUs. And it has to not blow<00:06:00.360> up,<00:06:00.520> right?<00:06:00.680> Like<00:06:00.840> halfway<00:06:01.160> through" + }, + { + "start": 361.35, + "duration": 0.0, + "text": "blow up, right? Like halfway through" + }, + { + "start": 361.36, + "duration": 0.0, + "text": "blow up, right? 
Like halfway through training,<00:06:02.080> if<00:06:02.240> your,<00:06:02.720> you<00:06:02.800> know,<00:06:02.920> training" + }, + { + "start": 363.83, + "duration": 0.0, + "text": "training, if your, you know, training" + }, + { + "start": 363.84, + "duration": 0.0, + "text": "training, if your, you know, training losses<00:06:04.280> just<00:06:04.440> go<00:06:04.520> like<00:06:04.760> down<00:06:04.960> like<00:06:05.120> this<00:06:05.320> and" + }, + { + "start": 365.39, + "duration": 0.0, + "text": "losses just go like down like this and" + }, + { + "start": 365.4, + "duration": 0.0, + "text": "losses just go like down like this and then<00:06:05.520> suddenly<00:06:05.840> blow<00:06:06.080> up,<00:06:06.560> that's<00:06:06.720> no<00:06:06.880> good<00:06:07.040> at" + }, + { + "start": 367.15, + "duration": 0.0, + "text": "then suddenly blow up, that's no good at" + }, + { + "start": 367.16, + "duration": 0.0, + "text": "then suddenly blow up, that's no good at all,<00:06:07.360> right?<00:06:07.840> So<00:06:08.040> all<00:06:08.280> these<00:06:08.480> different" + }, + { + "start": 368.79, + "duration": 0.0, + "text": "all, right? So all these different" + }, + { + "start": 368.8, + "duration": 0.0, + "text": "all, right? So all these different requirements<00:06:09.800> end<00:06:09.960> up<00:06:10.080> getting<00:06:10.360> baked" + }, + { + "start": 370.87, + "duration": 0.0, + "text": "requirements end up getting baked" + }, + { + "start": 370.88, + "duration": 0.0, + "text": "requirements end up getting baked straight<00:06:11.320> into<00:06:11.480> the<00:06:11.600> architecture.<00:06:12.120> And" + }, + { + "start": 372.23, + "duration": 0.0, + "text": "straight into the architecture. And" + }, + { + "start": 372.24, + "duration": 0.0, + "text": "straight into the architecture. 
And that's<00:06:12.480> why<00:06:12.640> these<00:06:12.800> things<00:06:13.000> are<00:06:13.280> a<00:06:13.320> little<00:06:13.560> bit" + }, + { + "start": 373.71, + "duration": 0.0, + "text": "that's why these things are a little bit" + }, + { + "start": 373.72, + "duration": 0.0, + "text": "that's why these things are a little bit messy<00:06:14.120> and<00:06:14.240> a<00:06:14.280> little<00:06:14.480> bit<00:06:14.640> complex." + }, + { + "start": 375.95, + "duration": 0.0, + "text": "messy and a little bit complex." + }, + { + "start": 375.96, + "duration": 0.0, + "text": "messy and a little bit complex. Um<00:06:16.200> but<00:06:16.360> you<00:06:16.440> should<00:06:16.720> keep<00:06:16.920> that<00:06:17.080> in<00:06:17.160> mind<00:06:17.480> and" + }, + { + "start": 377.55, + "duration": 0.0, + "text": "Um but you should keep that in mind and" + }, + { + "start": 377.56, + "duration": 0.0, + "text": "Um but you should keep that in mind and that's<00:06:17.800> why,<00:06:18.000> you<00:06:18.160> know,<00:06:18.240> things<00:06:18.440> aren't<00:06:18.720> in" + }, + { + "start": 378.83, + "duration": 0.0, + "text": "that's why, you know, things aren't in" + }, + { + "start": 378.84, + "duration": 0.0, + "text": "that's why, you know, things aren't in many<00:06:19.000> ways<00:06:19.200> not<00:06:19.400> so<00:06:19.520> elegant." + }, + { + "start": 381.03, + "duration": 0.0, + "text": "many ways not so elegant." + }, + { + "start": 381.04, + "duration": 0.0, + "text": "many ways not so elegant. Okay." + }, + { + "start": 382.27, + "duration": 0.0, + "text": "Okay." + }, + { + "start": 382.28, + "duration": 0.0, + "text": "Okay. 
So<00:06:22.600> we're<00:06:22.760> going<00:06:22.880> to<00:06:22.960> start<00:06:23.640> with<00:06:24.120> the<00:06:24.240> core" + }, + { + "start": 384.55, + "duration": 0.0, + "text": "So we're going to start with the core" + }, + { + "start": 384.56, + "duration": 0.0, + "text": "So we're going to start with the core architecture<00:06:25.120> piece.<00:06:25.920> And<00:06:26.040> as<00:06:26.160> a<00:06:26.280> high-level" + }, + { + "start": 386.95, + "duration": 0.0, + "text": "architecture piece. And as a high-level" + }, + { + "start": 386.96, + "duration": 0.0, + "text": "architecture piece. And as a high-level view,<00:06:27.800> you<00:06:27.920> know,<00:06:28.280> I<00:06:28.360> think<00:06:28.600> the<00:06:28.960> the<00:06:29.080> way<00:06:29.280> that" + }, + { + "start": 389.43, + "duration": 0.0, + "text": "view, you know, I think the the way that" + }, + { + "start": 389.44, + "duration": 0.0, + "text": "view, you know, I think the the way that I<00:06:29.520> see<00:06:29.720> a<00:06:29.760> lot<00:06:30.000> of<00:06:30.080> the<00:06:30.160> architecture<00:06:30.640> stuff," + }, + { + "start": 390.99, + "duration": 0.0, + "text": "I see a lot of the architecture stuff," + }, + { + "start": 391.0, + "duration": 0.0, + "text": "I see a lot of the architecture stuff, you<00:06:31.080> know,<00:06:31.160> looking<00:06:31.840> basically" + }, + { + "start": 392.15, + "duration": 0.0, + "text": "you know, looking basically" + }, + { + "start": 392.16, + "duration": 0.0, + "text": "you know, looking basically historically,<00:06:33.320> is<00:06:33.880> kind<00:06:34.120> of<00:06:34.240> in<00:06:34.360> the<00:06:34.520> early" + }, + { + "start": 394.83, + "duration": 0.0, + "text": "historically, is kind of in the early" + }, + { + "start": 394.84, + "duration": 0.0, + "text": "historically, is kind of in the early days<00:06:35.200> of,<00:06:35.360> you<00:06:35.440> know,<00:06:35.560> starting<00:06:35.920> with<00:06:36.040> 
the" + }, + { + "start": 396.11, + "duration": 0.0, + "text": "days of, you know, starting with the" + }, + { + "start": 396.12, + "duration": 0.0, + "text": "days of, you know, starting with the transformer<00:06:36.880> until,<00:06:37.240> you<00:06:37.320> know,<00:06:37.480> GPT-3<00:06:38.520> or" + }, + { + "start": 398.63, + "duration": 0.0, + "text": "transformer until, you know, GPT-3 or" + }, + { + "start": 398.64, + "duration": 0.0, + "text": "transformer until, you know, GPT-3 or so,<00:06:39.160> there's<00:06:39.320> a<00:06:39.400> lot<00:06:39.600> of<00:06:39.680> experimentation" + }, + { + "start": 400.47, + "duration": 0.0, + "text": "so, there's a lot of experimentation" + }, + { + "start": 400.48, + "duration": 0.0, + "text": "so, there's a lot of experimentation that<00:06:40.600> happens.<00:06:40.960> People<00:06:41.200> try<00:06:41.400> lots<00:06:41.680> of" + }, + { + "start": 401.79, + "duration": 0.0, + "text": "that happens. People try lots of" + }, + { + "start": 401.8, + "duration": 0.0, + "text": "that happens. People try lots of different<00:06:42.160> things.<00:06:42.760> There's<00:06:42.960> no<00:06:43.120> like<00:06:43.360> gold" + }, + { + "start": 403.67, + "duration": 0.0, + "text": "different things. There's no like gold" + }, + { + "start": 403.68, + "duration": 0.0, + "text": "different things. There's no like gold standard<00:06:44.160> that<00:06:44.280> everyone<00:06:44.560> has<00:06:44.720> unified<00:06:45.200> on." + }, + { + "start": 405.87, + "duration": 0.0, + "text": "standard that everyone has unified on." + }, + { + "start": 405.88, + "duration": 0.0, + "text": "standard that everyone has unified on. 
And<00:06:46.000> then,<00:06:46.280> you<00:06:46.320> know,<00:06:46.440> LLaMA<00:06:46.720> 2<00:06:47.000> comes<00:06:47.320> out" + }, + { + "start": 407.79, + "duration": 0.0, + "text": "And then, you know, LLaMA 2 comes out" + }, + { + "start": 407.8, + "duration": 0.0, + "text": "And then, you know, LLaMA 2 comes out and<00:06:47.880> everyone's<00:06:48.160> like,<00:06:48.360> wow,<00:06:48.600> LLaMA<00:06:48.840> 2<00:06:48.960> is" + }, + { + "start": 409.03, + "duration": 0.0, + "text": "and everyone's like, wow, LLaMA 2 is" + }, + { + "start": 409.04, + "duration": 0.0, + "text": "and everyone's like, wow, LLaMA 2 is great.<00:06:49.360> I<00:06:49.440> want<00:06:49.880> my<00:06:50.080> own<00:06:50.280> LLaMA<00:06:50.560> 2.<00:06:51.200> And<00:06:51.360> so" + }, + { + "start": 411.59, + "duration": 0.0, + "text": "great. I want my own LLaMA 2. And so" + }, + { + "start": 411.6, + "duration": 0.0, + "text": "great. I want my own LLaMA 2. And so everyone<00:06:51.960> starts<00:06:52.280> training<00:06:52.640> LLaMA<00:06:52.920> 2-alikes" + }, + { + "start": 413.71, + "duration": 0.0, + "text": "everyone starts training LLaMA 2-alikes" + }, + { + "start": 413.72, + "duration": 0.0, + "text": "everyone starts training LLaMA 2-alikes with,<00:06:54.400> you<00:06:54.480> know,<00:06:54.600> minor<00:06:54.960> variation" + }, + { + "start": 416.47, + "duration": 0.0, + "text": "with, you know, minor variation" + }, + { + "start": 416.48, + "duration": 0.0, + "text": "with, you know, minor variation that<00:06:56.640> people<00:06:56.960> have.<00:06:57.720> And<00:06:57.840> then<00:06:58.000> finally,<00:06:58.840> you" + }, + { + "start": 418.95, + "duration": 0.0, + "text": "that people have. And then finally, you" + }, + { + "start": 418.96, + "duration": 0.0, + "text": "that people have. 
And then finally, you know,<00:06:59.160> last<00:06:59.680> year<00:07:00.040> we<00:07:00.200> saw<00:07:00.440> really<00:07:00.760> big" + }, + { + "start": 420.95, + "duration": 0.0, + "text": "know, last year we saw really big" + }, + { + "start": 420.96, + "duration": 0.0, + "text": "know, last year we saw really big differences<00:07:01.480> or<00:07:01.680> or<00:07:01.760> sort<00:07:01.880> of<00:07:01.960> a<00:07:02.000> trend" + }, + { + "start": 422.99, + "duration": 0.0, + "text": "differences or or sort of a trend" + }, + { + "start": 423.0, + "duration": 0.0, + "text": "differences or or sort of a trend towards<00:07:03.760> architecture<00:07:04.360> modifications<00:07:05.040> that" + }, + { + "start": 425.19, + "duration": 0.0, + "text": "towards architecture modifications that" + }, + { + "start": 425.2, + "duration": 0.0, + "text": "towards architecture modifications that make<00:07:05.400> training<00:07:05.720> more<00:07:05.880> stable.<00:07:06.800> And<00:07:07.000> this<00:07:07.240> year" + }, + { + "start": 427.79, + "duration": 0.0, + "text": "make training more stable. And this year" + }, + { + "start": 427.8, + "duration": 0.0, + "text": "make training more stable. And this year we<00:07:08.000> see<00:07:08.400> lots<00:07:08.720> of<00:07:08.800> trends<00:07:09.240> towards" + }, + { + "start": 430.11, + "duration": 0.0, + "text": "we see lots of trends towards" + }, + { + "start": 430.12, + "duration": 0.0, + "text": "we see lots of trends towards architecture<00:07:10.640> variations<00:07:11.160> that<00:07:11.320> enable" + }, + { + "start": 432.03, + "duration": 0.0, + "text": "architecture variations that enable" + }, + { + "start": 432.04, + "duration": 0.0, + "text": "architecture variations that enable longer<00:07:12.440> context<00:07:12.920> dependence.<00:07:13.960> So<00:07:14.440> there<00:07:14.560> are" + }, + { + "start": 434.63, + "duration": 0.0, + "text": "longer context dependence. 
So there are" + }, + { + "start": 434.64, + "duration": 0.0, + "text": "longer context dependence. So there are these<00:07:14.840> big<00:07:15.080> themes<00:07:15.440> that<00:07:15.560> are<00:07:15.640> happening,<00:07:16.080> but" + }, + { + "start": 436.23, + "duration": 0.0, + "text": "these big themes that are happening, but" + }, + { + "start": 436.24, + "duration": 0.0, + "text": "these big themes that are happening, but really<00:07:16.520> I<00:07:16.600> think,<00:07:17.200> you<00:07:17.280> know,<00:07:17.400> you<00:07:17.560> see<00:07:17.720> this" + }, + { + "start": 437.87, + "duration": 0.0, + "text": "really I think, you know, you see this" + }, + { + "start": 437.88, + "duration": 0.0, + "text": "really I think, you know, you see this like<00:07:18.080> big<00:07:18.320> point<00:07:18.680> when<00:07:19.000> LLaMA<00:07:19.240> 2<00:07:19.440> comes<00:07:19.720> out" + }, + { + "start": 439.83, + "duration": 0.0, + "text": "like big point when LLaMA 2 comes out" + }, + { + "start": 439.84, + "duration": 0.0, + "text": "like big point when LLaMA 2 comes out and<00:07:19.960> everyone's<00:07:20.200> like,<00:07:20.360> wow,<00:07:20.560> I<00:07:20.600> want<00:07:20.720> to" + }, + { + "start": 440.79, + "duration": 0.0, + "text": "and everyone's like, wow, I want to" + }, + { + "start": 440.8, + "duration": 0.0, + "text": "and everyone's like, wow, I want to train<00:07:21.000> something<00:07:21.160> with<00:07:21.240> that.<00:07:21.760> And<00:07:21.880> then" + }, + { + "start": 441.99, + "duration": 0.0, + "text": "train something with that. And then" + }, + { + "start": 442.0, + "duration": 0.0, + "text": "train something with that. 
And then suddenly,<00:07:22.680> or<00:07:22.800> not<00:07:22.960> suddenly,<00:07:23.320> but<00:07:23.520> after" + }, + { + "start": 443.83, + "duration": 0.0, + "text": "suddenly, or not suddenly, but after" + }, + { + "start": 443.84, + "duration": 0.0, + "text": "suddenly, or not suddenly, but after that,<00:07:24.160> people<00:07:24.400> are<00:07:24.440> starting<00:07:24.720> to<00:07:24.760> explore" + }, + { + "start": 445.11, + "duration": 0.0, + "text": "that, people are starting to explore" + }, + { + "start": 445.12, + "duration": 0.0, + "text": "that, people are starting to explore once<00:07:25.280> again.<00:07:25.520> So<00:07:25.560> it's<00:07:25.680> kind<00:07:25.800> of<00:07:25.920> cool<00:07:26.120> to<00:07:26.240> see" + }, + { + "start": 447.11, + "duration": 0.0, + "text": "once again. So it's kind of cool to see" + }, + { + "start": 447.12, + "duration": 0.0, + "text": "once again. So it's kind of cool to see all<00:07:27.320> these<00:07:27.480> different<00:07:27.800> changes." + }, + { + "start": 449.63, + "duration": 0.0, + "text": "all these different changes." + }, + { + "start": 449.64, + "duration": 0.0, + "text": "all these different changes. Okay." + }, + { + "start": 450.55, + "duration": 0.0, + "text": "Okay." + }, + { + "start": 450.56, + "duration": 0.0, + "text": "Okay. 
Um<00:07:31.160> I<00:07:31.240> think<00:07:32.000> people<00:07:32.320> can<00:07:32.480> disagree<00:07:32.880> about<00:07:33.080> a" + }, + { + "start": 453.15, + "duration": 0.0, + "text": "Um I think people can disagree about a" + }, + { + "start": 453.16, + "duration": 0.0, + "text": "Um I think people can disagree about a lot<00:07:33.520> of<00:07:33.600> things<00:07:33.880> on<00:07:34.040> architectures,<00:07:35.000> but" + }, + { + "start": 455.15, + "duration": 0.0, + "text": "lot of things on architectures, but" + }, + { + "start": 455.16, + "duration": 0.0, + "text": "lot of things on architectures, but there<00:07:35.400> is<00:07:35.720> one<00:07:36.040> thing<00:07:36.360> that<00:07:36.560> everyone<00:07:36.840> agrees" + }, + { + "start": 457.23, + "duration": 0.0, + "text": "there is one thing that everyone agrees" + }, + { + "start": 457.24, + "duration": 0.0, + "text": "there is one thing that everyone agrees on,<00:07:37.640> you<00:07:37.720> know,<00:07:37.960> like<00:07:38.160> if<00:07:38.240> you<00:07:38.360> take<00:07:38.600> the" + }, + { + "start": 458.83, + "duration": 0.0, + "text": "on, you know, like if you take the" + }, + { + "start": 458.84, + "duration": 0.0, + "text": "on, you know, like if you take the transformer<00:07:39.400> paper,<00:07:40.320> I<00:07:40.360> think<00:07:40.520> a<00:07:40.600> lot<00:07:40.760> of" + }, + { + "start": 460.87, + "duration": 0.0, + "text": "transformer paper, I think a lot of" + }, + { + "start": 460.88, + "duration": 0.0, + "text": "transformer paper, I think a lot of people<00:07:41.200> will<00:07:41.280> say<00:07:41.440> like<00:07:41.600> the<00:07:41.680> transformer" + }, + { + "start": 462.23, + "duration": 0.0, + "text": "people will say like the transformer" + }, + { + "start": 462.24, + "duration": 0.0, + "text": "people will say like the transformer people<00:07:42.720> got<00:07:43.040> like<00:07:43.280> most<00:07:43.640> of<00:07:43.720> the<00:07:43.800> 
things" + }, + { + "start": 464.07, + "duration": 0.0, + "text": "people got like most of the things" + }, + { + "start": 464.08, + "duration": 0.0, + "text": "people got like most of the things right,<00:07:44.600> except<00:07:45.320> this.<00:07:45.920> And<00:07:46.040> the<00:07:46.120> thing<00:07:46.320> that" + }, + { + "start": 466.47, + "duration": 0.0, + "text": "right, except this. And the thing that" + }, + { + "start": 466.48, + "duration": 0.0, + "text": "right, except this. And the thing that they<00:07:46.640> really<00:07:47.000> did<00:07:47.160> not<00:07:47.400> get<00:07:47.560> right<00:07:47.840> or<00:07:47.960> like<00:07:48.320> I" + }, + { + "start": 468.35, + "duration": 0.0, + "text": "they really did not get right or like I" + }, + { + "start": 468.36, + "duration": 0.0, + "text": "they really did not get right or like I think<00:07:48.600> most<00:07:48.800> people<00:07:49.080> agree<00:07:49.640> they<00:07:49.760> did<00:07:49.920> not<00:07:50.120> get" + }, + { + "start": 470.31, + "duration": 0.0, + "text": "think most people agree they did not get" + }, + { + "start": 470.32, + "duration": 0.0, + "text": "think most people agree they did not get right<00:07:50.960> is<00:07:51.320> where<00:07:51.600> you<00:07:51.760> put<00:07:52.000> the<00:07:52.120> layer<00:07:52.480> norm," + }, + { + "start": 472.87, + "duration": 0.0, + "text": "right is where you put the layer norm," + }, + { + "start": 472.88, + "duration": 0.0, + "text": "right is where you put the layer norm, right?<00:07:53.160> So<00:07:53.760> in<00:07:53.880> the<00:07:54.000> original<00:07:55.080> um" + }, + { + "start": 475.91, + "duration": 0.0, + "text": "right? So in the original um" + }, + { + "start": 475.92, + "duration": 0.0, + "text": "right? 
So in the original um uh<00:07:56.000> transformer<00:07:56.600> paper,<00:07:57.360> you<00:07:57.440> know,<00:07:57.600> the" + }, + { + "start": 477.71, + "duration": 0.0, + "text": "uh transformer paper, you know, the" + }, + { + "start": 477.72, + "duration": 0.0, + "text": "uh transformer paper, you know, the layer<00:07:58.000> norm<00:07:58.480> goes<00:07:58.760> in<00:07:58.960> what<00:07:59.120> you<00:07:59.240> would<00:07:59.400> call" + }, + { + "start": 479.55, + "duration": 0.0, + "text": "layer norm goes in what you would call" + }, + { + "start": 479.56, + "duration": 0.0, + "text": "layer norm goes in what you would call the<00:07:59.680> residual<00:08:00.320> path,<00:08:00.920> right?<00:08:01.080> So<00:08:01.160> in<00:08:01.240> the" + }, + { + "start": 481.31, + "duration": 0.0, + "text": "the residual path, right? So in the" + }, + { + "start": 481.32, + "duration": 0.0, + "text": "the residual path, right? So in the transformer,<00:08:02.240> you<00:08:02.320> know,<00:08:02.440> you<00:08:02.520> have<00:08:02.680> the" + }, + { + "start": 482.75, + "duration": 0.0, + "text": "transformer, you know, you have the" + }, + { + "start": 482.76, + "duration": 0.0, + "text": "transformer, you know, you have the residual<00:08:03.200> stream,<00:08:03.520> this<00:08:03.720> X<00:08:04.040> that<00:08:04.200> kind<00:08:04.320> of" + }, + { + "start": 484.51, + "duration": 0.0, + "text": "residual stream, this X that kind of" + }, + { + "start": 484.52, + "duration": 0.0, + "text": "residual stream, this X that kind of runs<00:08:04.880> through<00:08:05.040> the<00:08:05.160> whole<00:08:05.360> network.<00:08:06.240> And" + }, + { + "start": 491.76, + "duration": 0.0, + "text": "sort<00:08:11.880> of<00:08:11.960> a<00:08:12.040> delta<00:08:12.800> back<00:08:13.040> into<00:08:13.240> this<00:08:13.880> residual" + }, + { + "start": 494.47, + "duration": 0.0, + "text": "sort of a delta back into this residual" + }, + { + "start": 
494.48, + "duration": 0.0, + "text": "sort of a delta back into this residual stream." + }, + { + "start": 495.59, + "duration": 0.0, + "text": "stream." + }, + { + "start": 495.6, + "duration": 0.0, + "text": "stream. And<00:08:15.800> then<00:08:16.160> in<00:08:16.280> order<00:08:16.480> to<00:08:16.680> make<00:08:16.920> sure<00:08:17.280> that" + }, + { + "start": 497.43, + "duration": 0.0, + "text": "And then in order to make sure that" + }, + { + "start": 497.44, + "duration": 0.0, + "text": "And then in order to make sure that these<00:08:17.760> gradients<00:08:18.280> are<00:08:18.400> sort<00:08:18.520> of<00:08:18.640> stable" + }, + { + "start": 499.23, + "duration": 0.0, + "text": "these gradients are sort of stable" + }, + { + "start": 499.24, + "duration": 0.0, + "text": "these gradients are sort of stable across<00:08:19.680> layers,<00:08:20.480> you<00:08:20.560> know,<00:08:20.680> a<00:08:20.760> layer<00:08:21.080> norm<00:08:21.320> is" + }, + { + "start": 501.43, + "duration": 0.0, + "text": "across layers, you know, a layer norm is" + }, + { + "start": 501.44, + "duration": 0.0, + "text": "across layers, you know, a layer norm is placed<00:08:21.840> at<00:08:21.960> the<00:08:22.080> end<00:08:22.600> of<00:08:22.760> each<00:08:22.920> of<00:08:23.040> these" + }, + { + "start": 503.27, + "duration": 0.0, + "text": "placed at the end of each of these" + }, + { + "start": 503.28, + "duration": 0.0, + "text": "placed at the end of each of these components." + }, + { + "start": 504.91, + "duration": 0.0, + "text": "components." + }, + { + "start": 504.92, + "duration": 0.0, + "text": "components. 
Now,<00:08:25.880> um" + }, + { + "start": 506.63, + "duration": 0.0, + "text": "Now, um" + }, + { + "start": 506.64, + "duration": 0.0, + "text": "Now, um instead<00:08:27.280> of<00:08:27.400> putting<00:08:27.640> the<00:08:27.760> layer<00:08:28.000> norms<00:08:28.360> in" + }, + { + "start": 508.59, + "duration": 0.0, + "text": "instead of putting the layer norms in" + }, + { + "start": 508.6, + "duration": 0.0, + "text": "instead of putting the layer norms in the<00:08:28.720> residual<00:08:29.200> stream,<00:08:29.560> there's<00:08:29.760> an" + }, + { + "start": 509.87, + "duration": 0.0, + "text": "the residual stream, there's an" + }, + { + "start": 509.88, + "duration": 0.0, + "text": "the residual stream, there's an alternative.<00:08:30.880> Um<00:08:31.120> I'll<00:08:31.320> refer<00:08:31.600> to<00:08:31.720> this<00:08:31.920> as" + }, + { + "start": 512.11, + "duration": 0.0, + "text": "alternative. Um I'll refer to this as" + }, + { + "start": 512.12, + "duration": 0.0, + "text": "alternative. 
Um I'll refer to this as pre-norm," + }, + { + "start": 513.63, + "duration": 0.0, + "text": "pre-norm," + }, + { + "start": 513.64, + "duration": 0.0, + "text": "pre-norm, um<00:08:34.000> in<00:08:34.159> which<00:08:34.680> you<00:08:34.760> can<00:08:34.960> put<00:08:35.159> the<00:08:35.280> layer<00:08:35.560> norm" + }, + { + "start": 516.149, + "duration": 0.0, + "text": "um in which you can put the layer norm" + }, + { + "start": 516.159, + "duration": 0.0, + "text": "um in which you can put the layer norm outside<00:08:36.800> of<00:08:36.880> the<00:08:36.960> residual<00:08:37.440> stream,<00:08:38.039> but" + }, + { + "start": 518.23, + "duration": 0.0, + "text": "outside of the residual stream, but" + }, + { + "start": 518.24, + "duration": 0.0, + "text": "outside of the residual stream, but before<00:08:38.919> each<00:08:39.080> of<00:08:39.159> the<00:08:39.280> computations.<00:08:39.960> So<00:08:40.039> you" + }, + { + "start": 520.11, + "duration": 0.0, + "text": "before each of the computations. So you" + }, + { + "start": 520.12, + "duration": 0.0, + "text": "before each of the computations. So you can<00:08:40.240> put<00:08:40.360> it<00:08:40.440> before<00:08:40.800> the<00:08:40.880> multi-head" + }, + { + "start": 521.31, + "duration": 0.0, + "text": "can put it before the multi-head" + }, + { + "start": 521.32, + "duration": 0.0, + "text": "can put it before the multi-head attention,<00:08:41.680> you<00:08:41.719> can<00:08:41.880> put<00:08:42.080> it<00:08:42.159> before<00:08:42.560> the" + }, + { + "start": 522.67, + "duration": 0.0, + "text": "attention, you can put it before the" + }, + { + "start": 522.68, + "duration": 0.0, + "text": "attention, you can put it before the FFN,<00:08:43.640> right?<00:08:44.200> Um<00:08:44.400> we'll<00:08:44.520> call<00:08:44.680> this<00:08:44.880> pre-norm." + }, + { + "start": 525.59, + "duration": 0.0, + "text": "FFN, right? Um we'll call this pre-norm." 
+ }, + { + "start": 525.6, + "duration": 0.0, + "text": "FFN, right? Um we'll call this pre-norm. Um" + }, + { + "start": 526.27, + "duration": 0.0, + "text": "Um" + }, + { + "start": 526.28, + "duration": 0.0, + "text": "Um the<00:08:46.440> nomenclature<00:08:47.000> will<00:08:47.120> get<00:08:47.240> a<00:08:47.280> little<00:08:47.520> bit" + }, + { + "start": 527.67, + "duration": 0.0, + "text": "the nomenclature will get a little bit" + }, + { + "start": 527.68, + "duration": 0.0, + "text": "the nomenclature will get a little bit confusing.<00:08:48.680> Um<00:08:49.360> You<00:08:49.440> can<00:08:49.600> call<00:08:49.800> this<00:08:50.280> uh" + }, + { + "start": 530.39, + "duration": 0.0, + "text": "confusing. Um You can call this uh" + }, + { + "start": 530.4, + "duration": 0.0, + "text": "confusing. Um You can call this uh post-norm<00:08:51.040> for<00:08:51.200> now,<00:08:51.480> but<00:08:51.640> let's<00:08:51.800> call<00:08:52.000> this" + }, + { + "start": 532.15, + "duration": 0.0, + "text": "post-norm for now, but let's call this" + }, + { + "start": 532.16, + "duration": 0.0, + "text": "post-norm for now, but let's call this sort<00:08:52.280> of<00:08:52.400> residual<00:08:52.960> norm,<00:08:53.400> right?<00:08:53.600> Cuz<00:08:53.720> you're" + }, + { + "start": 533.83, + "duration": 0.0, + "text": "sort of residual norm, right? Cuz you're" + }, + { + "start": 533.84, + "duration": 0.0, + "text": "sort of residual norm, right? Cuz you're putting<00:08:54.040> the<00:08:54.160> norm<00:08:54.360> in<00:08:54.440> the<00:08:54.520> residual<00:08:55.040> layer." + }, + { + "start": 536.23, + "duration": 0.0, + "text": "putting the norm in the residual layer." + }, + { + "start": 536.24, + "duration": 0.0, + "text": "putting the norm in the residual layer. 
Um<00:08:56.720> basically<00:08:57.320> all<00:08:57.760> modern<00:08:58.240> language<00:08:58.600> models" + }, + { + "start": 539.83, + "duration": 0.0, + "text": "Um basically all modern language models" + }, + { + "start": 539.84, + "duration": 0.0, + "text": "Um basically all modern language models uh<00:09:00.280> push<00:09:00.560> the<00:09:00.680> layer<00:09:00.920> norm<00:09:01.440> outside<00:09:01.960> of<00:09:02.040> the" + }, + { + "start": 542.11, + "duration": 0.0, + "text": "uh push the layer norm outside of the" + }, + { + "start": 542.12, + "duration": 0.0, + "text": "uh push the layer norm outside of the residual<00:09:02.680> stream.<00:09:03.000> This<00:09:03.160> is<00:09:03.240> just<00:09:03.400> like<00:09:03.560> a" + }, + { + "start": 543.59, + "duration": 0.0, + "text": "residual stream. This is just like a" + }, + { + "start": 543.6, + "duration": 0.0, + "text": "residual stream. This is just like a thing<00:09:03.880> that<00:09:04.440> basically<00:09:05.000> everybody<00:09:05.520> does." + }, + { + "start": 546.55, + "duration": 0.0, + "text": "thing that basically everybody does." + }, + { + "start": 546.56, + "duration": 0.0, + "text": "thing that basically everybody does. Um<00:09:07.280> there<00:09:07.520> is<00:09:07.760> one<00:09:08.040> funny<00:09:08.320> exception,<00:09:09.200> but<00:09:09.400> it" + }, + { + "start": 549.51, + "duration": 0.0, + "text": "Um there is one funny exception, but it" + }, + { + "start": 549.52, + "duration": 0.0, + "text": "Um there is one funny exception, but it is<00:09:09.640> OPT-350M." + }, + { + "start": 551.39, + "duration": 0.0, + "text": "is OPT-350M." + }, + { + "start": 551.4, + "duration": 0.0, + "text": "is OPT-350M. 
And<00:09:11.680> if<00:09:11.840> you<00:09:12.040> all<00:09:12.200> are<00:09:12.320> familiar<00:09:12.920> with<00:09:13.120> sort<00:09:13.320> of" + }, + { + "start": 553.43, + "duration": 0.0, + "text": "And if you all are familiar with sort of" + }, + { + "start": 553.44, + "duration": 0.0, + "text": "And if you all are familiar with sort of language<00:09:13.800> models,<00:09:14.520> we<00:09:14.640> kind<00:09:14.800> of<00:09:14.920> know<00:09:15.160> OPT<00:09:15.720> in" + }, + { + "start": 555.87, + "duration": 0.0, + "text": "language models, we kind of know OPT in" + }, + { + "start": 555.88, + "duration": 0.0, + "text": "language models, we kind of know OPT in general<00:09:16.680> was,<00:09:17.200> you<00:09:17.280> know,<00:09:17.440> kind<00:09:17.680> of<00:09:17.800> a<00:09:17.880> mess<00:09:18.200> of" + }, + { + "start": 558.31, + "duration": 0.0, + "text": "general was, you know, kind of a mess of" + }, + { + "start": 558.32, + "duration": 0.0, + "text": "general was, you know, kind of a mess of a<00:09:18.360> language<00:09:18.760> model,<00:09:19.120> right?<00:09:19.360> And<00:09:19.480> OPT-350M" + }, + { + "start": 561.31, + "duration": 0.0, + "text": "a language model, right? And OPT-350M" + }, + { + "start": 561.32, + "duration": 0.0, + "text": "a language model, right? 
And OPT-350M um<00:09:21.400> is<00:09:21.560> even<00:09:21.800> more<00:09:22.040> so<00:09:22.240> because<00:09:22.480> I<00:09:22.520> don't<00:09:22.720> know" + }, + { + "start": 562.95, + "duration": 0.0, + "text": "um is even more so because I don't know" + }, + { + "start": 562.96, + "duration": 0.0, + "text": "um is even more so because I don't know why<00:09:23.360> only<00:09:23.640> that<00:09:23.920> model<00:09:24.720> uh<00:09:24.840> has<00:09:25.280> a<00:09:25.840> post<00:09:26.440> layer" + }, + { + "start": 566.71, + "duration": 0.0, + "text": "why only that model uh has a post layer" + }, + { + "start": 566.72, + "duration": 0.0, + "text": "why only that model uh has a post layer norm<00:09:26.960> in<00:09:27.040> the<00:09:27.160> residual<00:09:27.600> stream." + }, + { + "start": 568.91, + "duration": 0.0, + "text": "norm in the residual stream." + }, + { + "start": 568.92, + "duration": 0.0, + "text": "norm in the residual stream. Okay.<00:09:29.600> So<00:09:29.760> this<00:09:30.000> is<00:09:30.120> one<00:09:30.280> of<00:09:30.320> the<00:09:30.440> things<00:09:30.680> that" + }, + { + "start": 570.83, + "duration": 0.0, + "text": "Okay. So this is one of the things that" + }, + { + "start": 570.84, + "duration": 0.0, + "text": "Okay. So this is one of the things that like<00:09:31.040> everyone<00:09:31.440> agrees<00:09:31.840> on.<00:09:32.480> And<00:09:32.600> so<00:09:32.720> you" + }, + { + "start": 572.79, + "duration": 0.0, + "text": "like everyone agrees on. And so you" + }, + { + "start": 572.8, + "duration": 0.0, + "text": "like everyone agrees on. 
And so you might<00:09:33.040> wonder<00:09:33.400> like<00:09:33.640> why<00:09:33.920> is<00:09:34.120> this<00:09:34.400> like<00:09:34.640> such" + }, + { + "start": 575.03, + "duration": 0.0, + "text": "might wonder like why is this like such" + }, + { + "start": 575.04, + "duration": 0.0, + "text": "might wonder like why is this like such a,<00:09:35.360> you<00:09:35.440> know," + }, + { + "start": 576.79, + "duration": 0.0, + "text": "a, you know," + }, + { + "start": 576.8, + "duration": 0.0, + "text": "a, you know, uh<00:09:37.320> like<00:09:37.560> a<00:09:37.640> uni-<00:09:38.120> unified<00:09:38.600> thing<00:09:39.160> across<00:09:39.600> all" + }, + { + "start": 579.79, + "duration": 0.0, + "text": "uh like a uni- unified thing across all" + }, + { + "start": 579.8, + "duration": 0.0, + "text": "uh like a uni- unified thing across all the<00:09:39.920> different<00:09:40.760> models?" + }, + { + "start": 582.27, + "duration": 0.0, + "text": "the different models?" + }, + { + "start": 582.28, + "duration": 0.0, + "text": "the different models? 
Um<00:09:42.880> and<00:09:43.000> if<00:09:43.120> you<00:09:43.240> look<00:09:43.440> at<00:09:43.560> some<00:09:43.720> of<00:09:43.800> the<00:09:43.960> early" + }, + { + "start": 584.35, + "duration": 0.0, + "text": "Um and if you look at some of the early" + }, + { + "start": 584.36, + "duration": 0.0, + "text": "Um and if you look at some of the early works<00:09:44.840> studying<00:09:45.400> like<00:09:45.720> where<00:09:46.000> do<00:09:46.120> you<00:09:46.280> place" + }, + { + "start": 586.59, + "duration": 0.0, + "text": "works studying like where do you place" + }, + { + "start": 586.6, + "duration": 0.0, + "text": "works studying like where do you place the<00:09:46.720> layer<00:09:47.080> norm<00:09:47.400> style<00:09:47.760> research,<00:09:48.920> um<00:09:49.280> what" + }, + { + "start": 589.43, + "duration": 0.0, + "text": "the layer norm style research, um what" + }, + { + "start": 589.44, + "duration": 0.0, + "text": "the layer norm style research, um what you<00:09:49.600> really<00:09:49.920> see<00:09:50.760> is<00:09:51.040> that,<00:09:51.720> you<00:09:51.840> know,<00:09:52.080> the" + }, + { + "start": 592.27, + "duration": 0.0, + "text": "you really see is that, you know, the" + }, + { + "start": 592.28, + "duration": 0.0, + "text": "you really see is that, you know, the early<00:09:52.560> motivation<00:09:53.240> for<00:09:53.400> a<00:09:53.440> lot<00:09:53.720> of<00:09:53.840> this<00:09:54.040> was" + }, + { + "start": 595.27, + "duration": 0.0, + "text": "early motivation for a lot of this was" + }, + { + "start": 595.28, + "duration": 0.0, + "text": "early motivation for a lot of this was when<00:09:55.440> you<00:09:55.520> train<00:09:55.760> a<00:09:55.800> transformer,<00:09:56.600> you<00:09:56.680> need" + }, + { + "start": 596.83, + "duration": 0.0, + "text": "when you train a transformer, you need" + }, + { + "start": 596.84, + "duration": 0.0, + "text": "when you train a transformer, you need to<00:09:56.920> 
do<00:09:57.120> a<00:09:57.160> warm-up.<00:09:57.760> Actually,<00:09:58.280> you<00:09:58.360> know," + }, + { + "start": 598.47, + "duration": 0.0, + "text": "to do a warm-up. Actually, you know," + }, + { + "start": 598.48, + "duration": 0.0, + "text": "to do a warm-up. Actually, you know, modern<00:09:58.920> transformer<00:09:59.400> training<00:09:59.640> still<00:09:59.880> does" + }, + { + "start": 600.03, + "duration": 0.0, + "text": "modern transformer training still does" + }, + { + "start": 600.04, + "duration": 0.0, + "text": "modern transformer training still does warm-ups<00:10:00.440> as<00:10:00.600> well.<00:10:01.400> But<00:10:01.560> you<00:10:01.840> definitely" + }, + { + "start": 602.27, + "duration": 0.0, + "text": "warm-ups as well. But you definitely" + }, + { + "start": 602.28, + "duration": 0.0, + "text": "warm-ups as well. But you definitely need<00:10:02.440> to<00:10:02.520> do<00:10:02.720> warm-up<00:10:03.120> when<00:10:03.240> you<00:10:03.320> train." + }, + { + "start": 604.55, + "duration": 0.0, + "text": "need to do warm-up when you train." + }, + { + "start": 604.56, + "duration": 0.0, + "text": "need to do warm-up when you train. Now,<00:10:05.200> wouldn't<00:10:05.440> it<00:10:05.520> be<00:10:05.680> nice<00:10:06.080> if<00:10:06.200> we<00:10:06.280> could" + }, + { + "start": 606.43, + "duration": 0.0, + "text": "Now, wouldn't it be nice if we could" + }, + { + "start": 606.44, + "duration": 0.0, + "text": "Now, wouldn't it be nice if we could remove<00:10:06.840> the<00:10:06.960> warm-up,<00:10:07.680> right?<00:10:07.920> So,<00:10:08.000> that<00:10:08.120> was" + }, + { + "start": 608.27, + "duration": 0.0, + "text": "remove the warm-up, right? So, that was" + }, + { + "start": 608.28, + "duration": 0.0, + "text": "remove the warm-up, right? 
So, that was kind<00:10:08.400> of<00:10:08.480> the<00:10:08.560> initial<00:10:08.880> motivation<00:10:09.480> for<00:10:09.600> a<00:10:09.640> lot" + }, + { + "start": 609.79, + "duration": 0.0, + "text": "kind of the initial motivation for a lot" + }, + { + "start": 609.8, + "duration": 0.0, + "text": "kind of the initial motivation for a lot of<00:10:09.880> this<00:10:10.040> research." + }, + { + "start": 611.31, + "duration": 0.0, + "text": "of this research." + }, + { + "start": 611.32, + "duration": 0.0, + "text": "of this research. But<00:10:11.520> people<00:10:11.800> quickly<00:10:12.160> realized<00:10:12.640> that" + }, + { + "start": 612.79, + "duration": 0.0, + "text": "But people quickly realized that" + }, + { + "start": 612.8, + "duration": 0.0, + "text": "But people quickly realized that removing<00:10:13.280> the<00:10:13.400> warm-up<00:10:14.120> had<00:10:14.320> very<00:10:14.560> serious" + }, + { + "start": 615.15, + "duration": 0.0, + "text": "removing the warm-up had very serious" + }, + { + "start": 615.16, + "duration": 0.0, + "text": "removing the warm-up had very serious issues<00:10:15.640> in<00:10:15.760> terms<00:10:16.000> of<00:10:16.080> the<00:10:16.160> stability<00:10:16.800> and" + }, + { + "start": 616.91, + "duration": 0.0, + "text": "issues in terms of the stability and" + }, + { + "start": 616.92, + "duration": 0.0, + "text": "issues in terms of the stability and convergence" + }, + { + "start": 618.31, + "duration": 0.0, + "text": "convergence" + }, + { + "start": 618.32, + "duration": 0.0, + "text": "convergence of<00:10:18.480> these<00:10:18.680> things,<00:10:19.080> right?<00:10:19.280> So,<00:10:19.360> if<00:10:19.480> you<00:10:19.600> did" + }, + { + "start": 620.23, + "duration": 0.0, + "text": "of these things, right? So, if you did" + }, + { + "start": 620.24, + "duration": 0.0, + "text": "of these things, right? 
So, if you did post<00:10:20.560> norm<00:10:20.800> plus<00:10:21.040> layer<00:10:21.320> norm,<00:10:21.880> which<00:10:22.080> is,<00:10:22.240> you" + }, + { + "start": 622.31, + "duration": 0.0, + "text": "post norm plus layer norm, which is, you" + }, + { + "start": 622.32, + "duration": 0.0, + "text": "post norm plus layer norm, which is, you know,<00:10:22.440> basically<00:10:22.880> the<00:10:22.960> original<00:10:23.360> transformer" + }, + { + "start": 623.91, + "duration": 0.0, + "text": "know, basically the original transformer" + }, + { + "start": 623.92, + "duration": 0.0, + "text": "know, basically the original transformer thing,<00:10:24.360> you<00:10:24.560> got<00:10:24.720> this<00:10:24.880> purple<00:10:25.240> dash<00:10:25.560> line." + }, + { + "start": 625.83, + "duration": 0.0, + "text": "thing, you got this purple dash line." + }, + { + "start": 625.84, + "duration": 0.0, + "text": "thing, you got this purple dash line. Oh,<00:10:26.040> you<00:10:26.160> just<00:10:26.360> don't<00:10:26.520> converge<00:10:26.960> as<00:10:27.120> well" + }, + { + "start": 628.55, + "duration": 0.0, + "text": "Oh, you just don't converge as well" + }, + { + "start": 628.56, + "duration": 0.0, + "text": "Oh, you just don't converge as well compared<00:10:29.040> to<00:10:29.560> doing<00:10:29.800> something<00:10:30.160> like<00:10:30.360> pre" + }, + { + "start": 630.55, + "duration": 0.0, + "text": "compared to doing something like pre" + }, + { + "start": 630.56, + "duration": 0.0, + "text": "compared to doing something like pre norm.<00:10:30.880> You<00:10:30.960> can<00:10:31.080> ignore<00:10:31.400> the<00:10:31.520> other<00:10:32.200> terms." + }, + { + "start": 632.91, + "duration": 0.0, + "text": "norm. You can ignore the other terms." + }, + { + "start": 632.92, + "duration": 0.0, + "text": "norm. You can ignore the other terms. 
You<00:10:33.040> would<00:10:33.200> get<00:10:33.600> much<00:10:33.840> nicer<00:10:34.120> convergence" + }, + { + "start": 634.67, + "duration": 0.0, + "text": "You would get much nicer convergence" + }, + { + "start": 634.68, + "duration": 0.0, + "text": "You would get much nicer convergence even<00:10:34.960> without<00:10:35.280> warm-up,<00:10:35.800> right?<00:10:35.920> So,<00:10:36.000> this" + }, + { + "start": 636.15, + "duration": 0.0, + "text": "even without warm-up, right? So, this" + }, + { + "start": 636.16, + "duration": 0.0, + "text": "even without warm-up, right? So, this was<00:10:36.280> the<00:10:36.360> original<00:10:36.840> motivation." + }, + { + "start": 638.63, + "duration": 0.0, + "text": "was the original motivation." + }, + { + "start": 638.64, + "duration": 0.0, + "text": "was the original motivation. But<00:10:38.920> really<00:10:39.200> what<00:10:39.400> people<00:10:39.680> kind<00:10:39.840> of<00:10:39.960> quickly" + }, + { + "start": 640.39, + "duration": 0.0, + "text": "But really what people kind of quickly" + }, + { + "start": 640.4, + "duration": 0.0, + "text": "But really what people kind of quickly realized<00:10:41.560> is<00:10:41.800> that,<00:10:42.280> you<00:10:42.360> know,<00:10:42.560> moving<00:10:43.040> the" + }, + { + "start": 643.63, + "duration": 0.0, + "text": "realized is that, you know, moving the" + }, + { + "start": 643.64, + "duration": 0.0, + "text": "realized is that, you know, moving the layer<00:10:43.920> norms<00:10:44.200> outside<00:10:44.680> the<00:10:44.760> residual<00:10:45.200> stream" + }, + { + "start": 645.91, + "duration": 0.0, + "text": "layer norms outside the residual stream" + }, + { + "start": 645.92, + "duration": 0.0, + "text": "layer norms outside the residual stream has<00:10:46.120> some<00:10:46.520> pretty<00:10:46.840> important<00:10:47.440> implications" + }, + { + "start": 648.23, + "duration": 0.0, + "text": "has some pretty important implications" + }, + { + 
"start": 648.24, + "duration": 0.0, + "text": "has some pretty important implications as<00:10:48.400> you<00:10:48.520> make<00:10:49.160> your<00:10:49.320> network<00:10:49.800> deeper<00:10:50.520> and<00:10:50.680> as" + }, + { + "start": 650.79, + "duration": 0.0, + "text": "as you make your network deeper and as" + }, + { + "start": 650.8, + "duration": 0.0, + "text": "as you make your network deeper and as you<00:10:50.920> start<00:10:51.200> to<00:10:51.320> grapple<00:10:51.800> with<00:10:52.200> stability" + }, + { + "start": 652.75, + "duration": 0.0, + "text": "you start to grapple with stability" + }, + { + "start": 652.76, + "duration": 0.0, + "text": "you start to grapple with stability issues." + }, + { + "start": 654.39, + "duration": 0.0, + "text": "issues." + }, + { + "start": 654.4, + "duration": 0.0, + "text": "issues. To<00:10:54.520> me,<00:10:54.760> I<00:10:54.840> think<00:10:55.040> the<00:10:55.160> gradient<00:10:55.600> attenuation" + }, + { + "start": 656.31, + "duration": 0.0, + "text": "To me, I think the gradient attenuation" + }, + { + "start": 656.32, + "duration": 0.0, + "text": "To me, I think the gradient attenuation issues<00:10:57.200> are<00:10:57.360> kind<00:10:57.560> of<00:10:57.640> the<00:10:57.720> most<00:10:58.280> clear." + }, + { + "start": 660.27, + "duration": 0.0, + "text": "issues are kind of the most clear." + }, + { + "start": 660.28, + "duration": 0.0, + "text": "issues are kind of the most clear. 
When<00:11:00.480> you<00:11:00.600> sort<00:11:00.760> of<00:11:00.880> talk<00:11:01.160> to<00:11:01.280> people<00:11:01.640> who<00:11:01.800> do" + }, + { + "start": 662.11, + "duration": 0.0, + "text": "When you sort of talk to people who do" + }, + { + "start": 662.12, + "duration": 0.0, + "text": "When you sort of talk to people who do architecture<00:11:02.640> design,<00:11:03.320> I'm<00:11:03.440> not<00:11:03.680> really<00:11:04.320> one" + }, + { + "start": 664.63, + "duration": 0.0, + "text": "architecture design, I'm not really one" + }, + { + "start": 664.64, + "duration": 0.0, + "text": "architecture design, I'm not really one of<00:11:04.760> the<00:11:05.040> the<00:11:05.120> people<00:11:05.360> that<00:11:05.600> deeply<00:11:05.920> engages<00:11:06.320> in" + }, + { + "start": 666.43, + "duration": 0.0, + "text": "of the the people that deeply engages in" + }, + { + "start": 666.44, + "duration": 0.0, + "text": "of the the people that deeply engages in this.<00:11:06.640> But<00:11:06.760> one<00:11:06.880> of<00:11:06.960> the<00:11:07.040> things<00:11:07.280> that<00:11:07.440> people" + }, + { + "start": 667.71, + "duration": 0.0, + "text": "this. But one of the things that people" + }, + { + "start": 667.72, + "duration": 0.0, + "text": "this. But one of the things that people often<00:11:08.040> say<00:11:08.520> is<00:11:08.800> keep<00:11:09.080> your<00:11:09.200> residual<00:11:09.640> stream" + }, + { + "start": 670.03, + "duration": 0.0, + "text": "often say is keep your residual stream" + }, + { + "start": 670.04, + "duration": 0.0, + "text": "often say is keep your residual stream clean,<00:11:10.600> right?<00:11:10.839> So,<00:11:10.920> in<00:11:11.000> this<00:11:11.200> case,<00:11:11.800> you<00:11:11.920> have" + }, + { + "start": 672.03, + "duration": 0.0, + "text": "clean, right? So, in this case, you have" + }, + { + "start": 672.04, + "duration": 0.0, + "text": "clean, right? 
So, in this case, you have your<00:11:12.200> X's<00:11:12.520> coming<00:11:12.800> in<00:11:12.920> from<00:11:13.040> the<00:11:13.120> bottom<00:11:13.640> on" + }, + { + "start": 673.75, + "duration": 0.0, + "text": "your X's coming in from the bottom on" + }, + { + "start": 673.76, + "duration": 0.0, + "text": "your X's coming in from the bottom on the<00:11:13.839> pre<00:11:14.080> norm<00:11:14.280> side,<00:11:15.000> and<00:11:15.160> this<00:11:15.400> X<00:11:15.920> propagates" + }, + { + "start": 676.51, + "duration": 0.0, + "text": "the pre norm side, and this X propagates" + }, + { + "start": 676.52, + "duration": 0.0, + "text": "the pre norm side, and this X propagates all<00:11:16.760> the<00:11:16.839> way<00:11:17.000> up<00:11:17.120> to<00:11:17.200> the<00:11:17.280> top,<00:11:17.600> right?<00:11:17.760> All" + }, + { + "start": 677.91, + "duration": 0.0, + "text": "all the way up to the top, right? All" + }, + { + "start": 677.92, + "duration": 0.0, + "text": "all the way up to the top, right? All the<00:11:18.000> way<00:11:18.120> up<00:11:18.320> to<00:11:18.400> your<00:11:18.760> your<00:11:18.920> final<00:11:19.280> output." + }, + { + "start": 680.03, + "duration": 0.0, + "text": "the way up to your your final output." + }, + { + "start": 680.04, + "duration": 0.0, + "text": "the way up to your your final output. 
And<00:11:20.160> that<00:11:20.360> allows<00:11:20.680> gradients<00:11:21.160> to<00:11:21.280> propagate" + }, + { + "start": 681.79, + "duration": 0.0, + "text": "And that allows gradients to propagate" + }, + { + "start": 681.8, + "duration": 0.0, + "text": "And that allows gradients to propagate if<00:11:21.920> you<00:11:21.960> in<00:11:22.080> the<00:11:22.160> backward<00:11:22.680> pass<00:11:23.200> straight" + }, + { + "start": 683.75, + "duration": 0.0, + "text": "if you in the backward pass straight" + }, + { + "start": 683.76, + "duration": 0.0, + "text": "if you in the backward pass straight through<00:11:23.960> this,<00:11:24.400> right?" + }, + { + "start": 685.59, + "duration": 0.0, + "text": "through this, right?" + }, + { + "start": 685.6, + "duration": 0.0, + "text": "through this, right? Um<00:11:25.640> that<00:11:25.800> makes<00:11:26.120> gradient<00:11:26.520> propagation<00:11:27.040> very" + }, + { + "start": 687.27, + "duration": 0.0, + "text": "Um that makes gradient propagation very" + }, + { + "start": 687.28, + "duration": 0.0, + "text": "Um that makes gradient propagation very simple," + }, + { + "start": 688.59, + "duration": 0.0, + "text": "simple," + }, + { + "start": 688.6, + "duration": 0.0, + "text": "simple, which<00:11:28.839> improves<00:11:29.400> both<00:11:29.640> stability<00:11:30.480> and<00:11:31.000> signal" + }, + { + "start": 691.55, + "duration": 0.0, + "text": "which improves both stability and signal" + }, + { + "start": 691.56, + "duration": 0.0, + "text": "which improves both stability and signal propagation.<00:11:32.280> And<00:11:32.400> that's<00:11:32.560> sort<00:11:32.720> of<00:11:33.160> what" + }, + { + "start": 693.31, + "duration": 0.0, + "text": "propagation. And that's sort of what" + }, + { + "start": 693.32, + "duration": 0.0, + "text": "propagation. 
And that's sort of what people<00:11:33.600> realized<00:11:34.080> very,<00:11:34.360> very<00:11:34.640> quickly,<00:11:35.040> that" + }, + { + "start": 695.19, + "duration": 0.0, + "text": "people realized very, very quickly, that" + }, + { + "start": 695.2, + "duration": 0.0, + "text": "people realized very, very quickly, that if<00:11:35.360> you<00:11:35.480> do<00:11:36.080> something<00:11:36.680> like<00:11:37.200> pre<00:11:37.480> norm<00:11:37.920> in" + }, + { + "start": 698.03, + "duration": 0.0, + "text": "if you do something like pre norm in" + }, + { + "start": 698.04, + "duration": 0.0, + "text": "if you do something like pre norm in blue" + }, + { + "start": 699.15, + "duration": 0.0, + "text": "blue" + }, + { + "start": 699.16, + "duration": 0.0, + "text": "blue initialization,<00:11:40.520> sort<00:11:40.720> of<00:11:40.800> the<00:11:40.880> gradient" + }, + { + "start": 701.43, + "duration": 0.0, + "text": "initialization, sort of the gradient" + }, + { + "start": 701.44, + "duration": 0.0, + "text": "initialization, sort of the gradient size<00:11:41.800> is<00:11:41.920> kind<00:11:42.080> of<00:11:42.160> remains<00:11:42.600> the<00:11:42.720> same,<00:11:43.360> right?" + }, + { + "start": 703.59, + "duration": 0.0, + "text": "size is kind of remains the same, right?" + }, + { + "start": 703.6, + "duration": 0.0, + "text": "size is kind of remains the same, right? 
Because<00:11:43.839> you<00:11:43.920> have<00:11:44.080> this<00:11:44.200> nice<00:11:44.600> straight" + }, + { + "start": 704.99, + "duration": 0.0, + "text": "Because you have this nice straight" + }, + { + "start": 705.0, + "duration": 0.0, + "text": "Because you have this nice straight through<00:11:45.200> propagation<00:11:45.839> in<00:11:45.920> the<00:11:46.000> backward" + }, + { + "start": 706.43, + "duration": 0.0, + "text": "through propagation in the backward" + }, + { + "start": 706.44, + "duration": 0.0, + "text": "through propagation in the backward pass.<00:11:47.280> On<00:11:47.480> the<00:11:47.560> other<00:11:47.760> hand,<00:11:48.000> if<00:11:48.120> you<00:11:48.200> have" + }, + { + "start": 708.35, + "duration": 0.0, + "text": "pass. On the other hand, if you have" + }, + { + "start": 708.36, + "duration": 0.0, + "text": "pass. On the other hand, if you have post<00:11:48.800> layer<00:11:49.080> norm,<00:11:49.720> you<00:11:49.839> have<00:11:50.080> these<00:11:50.280> kind<00:11:50.440> of" + }, + { + "start": 710.63, + "duration": 0.0, + "text": "post layer norm, you have these kind of" + }, + { + "start": 710.64, + "duration": 0.0, + "text": "post layer norm, you have these kind of complicated<00:11:51.320> effects<00:11:51.760> that<00:11:51.920> happen<00:11:52.720> because" + }, + { + "start": 713.07, + "duration": 0.0, + "text": "complicated effects that happen because" + }, + { + "start": 713.08, + "duration": 0.0, + "text": "complicated effects that happen because you're<00:11:53.240> layer<00:11:53.560> norming<00:11:54.280> each<00:11:54.600> time<00:11:55.000> you're" + }, + { + "start": 715.15, + "duration": 0.0, + "text": "you're layer norming each time you're" + }, + { + "start": 715.16, + "duration": 0.0, + "text": "you're layer norming each time you're going<00:11:55.400> through<00:11:55.560> a<00:11:55.600> transformer<00:11:56.240> block.<00:11:56.600> And" + }, + { + "start": 716.67, + "duration": 0.0, + 
"text": "going through a transformer block. And" + }, + { + "start": 716.68, + "duration": 0.0, + "text": "going through a transformer block. And that's<00:11:56.839> going<00:11:56.960> to<00:11:57.040> change<00:11:57.280> the<00:11:57.360> norm<00:11:57.640> of<00:11:57.720> your" + }, + { + "start": 717.829, + "duration": 0.0, + "text": "that's going to change the norm of your" + }, + { + "start": 717.839, + "duration": 0.0, + "text": "that's going to change the norm of your gradients<00:11:58.680> as<00:11:58.839> you<00:11:58.960> go<00:11:59.120> backwards<00:11:59.680> through." + }, + { + "start": 720.71, + "duration": 0.0, + "text": "gradients as you go backwards through." + }, + { + "start": 720.72, + "duration": 0.0, + "text": "gradients as you go backwards through. So,<00:12:01.480> you<00:12:01.600> can<00:12:01.720> kind<00:12:01.920> of<00:12:02.040> see," + }, + { + "start": 723.63, + "duration": 0.0, + "text": "So, you can kind of see," + }, + { + "start": 723.64, + "duration": 0.0, + "text": "So, you can kind of see, you<00:12:03.839> know,<00:12:04.080> from<00:12:04.280> the<00:12:04.400> principle<00:12:04.880> of<00:12:05.040> keep" + }, + { + "start": 725.31, + "duration": 0.0, + "text": "you know, from the principle of keep" + }, + { + "start": 725.32, + "duration": 0.0, + "text": "you know, from the principle of keep your<00:12:05.440> residual<00:12:05.880> stream<00:12:06.200> clean,<00:12:06.839> that<00:12:07.120> pre" + }, + { + "start": 727.35, + "duration": 0.0, + "text": "your residual stream clean, that pre" + }, + { + "start": 727.36, + "duration": 0.0, + "text": "your residual stream clean, that pre norm<00:12:07.560> makes<00:12:07.800> a<00:12:07.880> lot<00:12:08.120> of<00:12:08.240> sense." + }, + { + "start": 729.39, + "duration": 0.0, + "text": "norm makes a lot of sense." + }, + { + "start": 729.4, + "duration": 0.0, + "text": "norm makes a lot of sense. 
People<00:12:09.720> also<00:12:10.000> realized<00:12:10.760> through" + }, + { + "start": 730.91, + "duration": 0.0, + "text": "People also realized through" + }, + { + "start": 730.92, + "duration": 0.0, + "text": "People also realized through experimentation<00:12:11.839> that<00:12:11.960> this<00:12:12.120> also<00:12:12.360> improves" + }, + { + "start": 732.95, + "duration": 0.0, + "text": "experimentation that this also improves" + }, + { + "start": 732.96, + "duration": 0.0, + "text": "experimentation that this also improves stability<00:12:13.640> in<00:12:13.800> general,<00:12:14.240> that<00:12:14.720> the<00:12:14.839> sizes<00:12:15.440> and" + }, + { + "start": 735.55, + "duration": 0.0, + "text": "stability in general, that the sizes and" + }, + { + "start": 735.56, + "duration": 0.0, + "text": "stability in general, that the sizes and frequencies<00:12:16.720> of<00:12:16.880> gradient<00:12:17.360> spikes" + }, + { + "start": 738.63, + "duration": 0.0, + "text": "frequencies of gradient spikes" + }, + { + "start": 738.64, + "duration": 0.0, + "text": "frequencies of gradient spikes were<00:12:19.000> improved<00:12:19.760> under<00:12:20.040> pre<00:12:20.280> norm<00:12:20.600> compared<00:12:21.080> to" + }, + { + "start": 741.23, + "duration": 0.0, + "text": "were improved under pre norm compared to" + }, + { + "start": 741.24, + "duration": 0.0, + "text": "were improved under pre norm compared to post<00:12:21.520> norm.<00:12:21.760> And,<00:12:21.920> you<00:12:22.160> know,<00:12:22.320> this<00:12:22.560> is<00:12:22.680> a" + }, + { + "start": 742.71, + "duration": 0.0, + "text": "post norm. And, you know, this is a" + }, + { + "start": 742.72, + "duration": 0.0, + "text": "post norm. 
And, you know, this is a figure<00:12:23.040> from<00:12:23.240> Salazar<00:12:23.720> and<00:12:23.839> UN,<00:12:24.520> who<00:12:24.680> were<00:12:24.800> one" + }, + { + "start": 744.95, + "duration": 0.0, + "text": "figure from Salazar and UN, who were one" + }, + { + "start": 744.96, + "duration": 0.0, + "text": "figure from Salazar and UN, who were one of<00:12:25.040> the<00:12:25.120> first<00:12:25.440> ones,<00:12:25.720> I<00:12:25.800> think,<00:12:26.080> to<00:12:26.400> to<00:12:26.520> study" + }, + { + "start": 746.79, + "duration": 0.0, + "text": "of the first ones, I think, to to study" + }, + { + "start": 746.8, + "duration": 0.0, + "text": "of the first ones, I think, to to study this<00:12:27.000> phenomena<00:12:27.480> carefully." + }, + { + "start": 748.75, + "duration": 0.0, + "text": "this phenomena carefully." + }, + { + "start": 748.76, + "duration": 0.0, + "text": "this phenomena carefully. I<00:12:28.920> think<00:12:29.080> this<00:12:29.240> is<00:12:29.360> the<00:12:29.440> reason<00:12:29.800> why<00:12:29.920> it<00:12:30.040> stuck" + }, + { + "start": 750.31, + "duration": 0.0, + "text": "I think this is the reason why it stuck" + }, + { + "start": 750.32, + "duration": 0.0, + "text": "I think this is the reason why it stuck around,<00:12:30.800> right?<00:12:31.680> Stability<00:12:32.560> and<00:12:32.720> the<00:12:32.800> ability" + }, + { + "start": 753.11, + "duration": 0.0, + "text": "around, right? Stability and the ability" + }, + { + "start": 753.12, + "duration": 0.0, + "text": "around, right? 
Stability and the ability to<00:12:33.240> go<00:12:33.400> deep<00:12:33.720> are<00:12:33.800> both<00:12:34.040> very,<00:12:34.320> very<00:12:34.640> important" + }, + { + "start": 755.47, + "duration": 0.0, + "text": "to go deep are both very, very important" + }, + { + "start": 755.48, + "duration": 0.0, + "text": "to go deep are both very, very important for<00:12:35.880> modern<00:12:36.360> large<00:12:36.800> language<00:12:37.120> models.<00:12:37.760> And" + }, + { + "start": 757.91, + "duration": 0.0, + "text": "for modern large language models. And" + }, + { + "start": 757.92, + "duration": 0.0, + "text": "for modern large language models. And so,<00:12:38.160> this<00:12:38.360> idea<00:12:38.760> of<00:12:38.960> moving<00:12:39.400> your<00:12:39.560> layer<00:12:39.760> norm" + }, + { + "start": 759.99, + "duration": 0.0, + "text": "so, this idea of moving your layer norm" + }, + { + "start": 760.0, + "duration": 0.0, + "text": "so, this idea of moving your layer norm outside<00:12:40.480> of<00:12:40.560> the<00:12:40.640> residual<00:12:41.120> stream<00:12:41.760> is<00:12:41.920> one" + }, + { + "start": 762.07, + "duration": 0.0, + "text": "outside of the residual stream is one" + }, + { + "start": 762.08, + "duration": 0.0, + "text": "outside of the residual stream is one that<00:12:42.280> basically<00:12:42.839> everyone<00:12:43.960> has<00:12:44.120> adopted." + }, + { + "start": 765.47, + "duration": 0.0, + "text": "that basically everyone has adopted." + }, + { + "start": 765.48, + "duration": 0.0, + "text": "that basically everyone has adopted. 
Um<00:12:46.440> so<00:12:46.560> now,<00:12:47.280> you<00:12:47.440> know,<00:12:47.760> if<00:12:48.120> putting<00:12:48.400> layer" + }, + { + "start": 768.63, + "duration": 0.0, + "text": "Um so now, you know, if putting layer" + }, + { + "start": 768.64, + "duration": 0.0, + "text": "Um so now, you know, if putting layer norms<00:12:49.240> in<00:12:49.400> residual<00:12:49.760> streams<00:12:50.240> is<00:12:50.440> bad," + }, + { + "start": 772.35, + "duration": 0.0, + "text": "norms in residual streams is bad," + }, + { + "start": 772.36, + "duration": 0.0, + "text": "norms in residual streams is bad, why<00:12:52.600> does<00:12:52.800> layer<00:12:53.000> norm<00:12:53.200> have<00:12:53.400> to<00:12:53.480> be<00:12:54.000> at<00:12:54.120> the" + }, + { + "start": 774.19, + "duration": 0.0, + "text": "why does layer norm have to be at the" + }, + { + "start": 774.2, + "duration": 0.0, + "text": "why does layer norm have to be at the start?<00:12:54.600> Of<00:12:54.720> course,<00:12:55.000> we<00:12:55.080> have<00:12:55.280> pre<00:12:55.520> norm," + }, + { + "start": 775.79, + "duration": 0.0, + "text": "start? Of course, we have pre norm," + }, + { + "start": 775.8, + "duration": 0.0, + "text": "start? 
Of course, we have pre norm, which<00:12:55.960> is,<00:12:56.240> you<00:12:56.320> know,<00:12:56.440> before<00:12:56.920> our" + }, + { + "start": 776.99, + "duration": 0.0, + "text": "which is, you know, before our" + }, + { + "start": 777.0, + "duration": 0.0, + "text": "which is, you know, before our computation,<00:12:57.560> but<00:12:57.680> we<00:12:57.760> could<00:12:57.880> have<00:12:58.040> it<00:12:58.280> after" + }, + { + "start": 778.67, + "duration": 0.0, + "text": "computation, but we could have it after" + }, + { + "start": 778.68, + "duration": 0.0, + "text": "computation, but we could have it after computation<00:12:59.280> as<00:12:59.440> well,<00:12:59.680> right?<00:12:59.800> That's" + }, + { + "start": 779.95, + "duration": 0.0, + "text": "computation as well, right? That's" + }, + { + "start": 779.96, + "duration": 0.0, + "text": "computation as well, right? That's equally<00:13:00.480> good<00:13:00.640> at<00:13:00.760> least<00:13:00.960> under<00:13:01.160> that" + }, + { + "start": 781.31, + "duration": 0.0, + "text": "equally good at least under that" + }, + { + "start": 781.32, + "duration": 0.0, + "text": "equally good at least under that knowledge<00:13:01.680> logic." + }, + { + "start": 782.95, + "duration": 0.0, + "text": "knowledge logic." + }, + { + "start": 782.96, + "duration": 0.0, + "text": "knowledge logic. Um<00:13:03.600> and<00:13:04.040> that's<00:13:04.200> exactly<00:13:04.720> right.<00:13:05.600> Many<00:13:06.080> recent" + }, + { + "start": 786.43, + "duration": 0.0, + "text": "Um and that's exactly right. Many recent" + }, + { + "start": 786.44, + "duration": 0.0, + "text": "Um and that's exactly right. 
Many recent models<00:13:07.000> like<00:13:07.200> Grok<00:13:07.640> or<00:13:07.800> Gemma<00:13:08.080> 2<00:13:08.440> or<00:13:08.600> Olmo<00:13:08.920> 2" + }, + { + "start": 790.15, + "duration": 0.0, + "text": "models like Grok or Gemma 2 or Olmo 2" + }, + { + "start": 790.16, + "duration": 0.0, + "text": "models like Grok or Gemma 2 or Olmo 2 have<00:13:10.440> the<00:13:10.560> structure<00:13:11.040> where<00:13:11.160> they<00:13:11.360> moved<00:13:11.760> the" + }, + { + "start": 791.829, + "duration": 0.0, + "text": "have the structure where they moved the" + }, + { + "start": 791.839, + "duration": 0.0, + "text": "have the structure where they moved the layer<00:13:12.120> norm<00:13:12.400> after<00:13:12.680> the<00:13:12.800> computation.<00:13:13.839> So," + }, + { + "start": 793.91, + "duration": 0.0, + "text": "layer norm after the computation. So," + }, + { + "start": 793.92, + "duration": 0.0, + "text": "layer norm after the computation. So, it's<00:13:14.040> a<00:13:14.080> post<00:13:14.480> norm<00:13:14.720> of<00:13:14.839> a<00:13:14.880> kind,<00:13:15.240> but<00:13:15.360> it's" + }, + { + "start": 795.47, + "duration": 0.0, + "text": "it's a post norm of a kind, but it's" + }, + { + "start": 795.48, + "duration": 0.0, + "text": "it's a post norm of a kind, but it's outside<00:13:15.960> the<00:13:16.040> residual<00:13:16.480> stream." + }, + { + "start": 797.39, + "duration": 0.0, + "text": "outside the residual stream." + }, + { + "start": 797.4, + "duration": 0.0, + "text": "outside the residual stream. 
Other<00:13:17.680> models<00:13:18.000> still<00:13:18.320> actually<00:13:18.640> just<00:13:18.839> put" + }, + { + "start": 798.99, + "duration": 0.0, + "text": "Other models still actually just put" + }, + { + "start": 799.0, + "duration": 0.0, + "text": "Other models still actually just put layer<00:13:19.160> norms<00:13:19.440> everywhere.<00:13:19.760> They<00:13:19.880> put<00:13:20.000> a<00:13:20.040> layer" + }, + { + "start": 800.23, + "duration": 0.0, + "text": "layer norms everywhere. They put a layer" + }, + { + "start": 800.24, + "duration": 0.0, + "text": "layer norms everywhere. They put a layer norm<00:13:20.440> here,<00:13:20.880> they<00:13:21.000> put<00:13:21.120> a<00:13:21.160> layer<00:13:21.400> norm<00:13:21.680> after." + }, + { + "start": 803.19, + "duration": 0.0, + "text": "norm here, they put a layer norm after." + }, + { + "start": 803.2, + "duration": 0.0, + "text": "norm here, they put a layer norm after. I'll<00:13:23.360> get<00:13:23.560> to<00:13:23.640> this<00:13:23.839> later<00:13:24.280> as<00:13:24.400> we<00:13:24.520> talk<00:13:24.760> about" + }, + { + "start": 804.99, + "duration": 0.0, + "text": "I'll get to this later as we talk about" + }, + { + "start": 805.0, + "duration": 0.0, + "text": "I'll get to this later as we talk about stability,<00:13:25.880> but<00:13:26.080> one<00:13:26.240> of<00:13:26.320> the<00:13:26.560> other<00:13:26.800> lessons" + }, + { + "start": 807.39, + "duration": 0.0, + "text": "stability, but one of the other lessons" + }, + { + "start": 807.4, + "duration": 0.0, + "text": "stability, but one of the other lessons that<00:13:27.600> it<00:13:27.720> seems<00:13:28.040> to<00:13:28.080> have<00:13:28.240> held<00:13:28.480> up<00:13:28.600> very<00:13:28.839> well" + }, + { + "start": 809.47, + "duration": 0.0, + "text": "that it seems to have held up very well" + }, + { + "start": 809.48, + "duration": 0.0, + "text": "that it seems to have held up very well is<00:13:29.800> 
if<00:13:29.920> you<00:13:30.000> have<00:13:30.120> stability<00:13:30.600> issues,<00:13:30.960> you<00:13:31.040> can" + }, + { + "start": 811.15, + "duration": 0.0, + "text": "is if you have stability issues, you can" + }, + { + "start": 811.16, + "duration": 0.0, + "text": "is if you have stability issues, you can kind<00:13:31.320> of<00:13:31.400> sprinkle<00:13:31.920> in<00:13:32.080> layer<00:13:32.320> norms" + }, + { + "start": 812.67, + "duration": 0.0, + "text": "kind of sprinkle in layer norms" + }, + { + "start": 812.68, + "duration": 0.0, + "text": "kind of sprinkle in layer norms everywhere,<00:13:33.320> and<00:13:33.720> that<00:13:33.839> will<00:13:33.960> generally" + }, + { + "start": 814.47, + "duration": 0.0, + "text": "everywhere, and that will generally" + }, + { + "start": 814.48, + "duration": 0.0, + "text": "everywhere, and that will generally improve<00:13:34.839> stability." + }, + { + "start": 816.23, + "duration": 0.0, + "text": "improve stability." + }, + { + "start": 816.24, + "duration": 0.0, + "text": "improve stability. It's<00:13:36.360> almost<00:13:36.640> very<00:13:36.839> strange<00:13:37.280> to<00:13:37.360> be<00:13:37.480> saying" + }, + { + "start": 817.79, + "duration": 0.0, + "text": "It's almost very strange to be saying" + }, + { + "start": 817.8, + "duration": 0.0, + "text": "It's almost very strange to be saying this<00:13:37.920> cuz<00:13:38.040> it's<00:13:38.160> so<00:13:38.360> ridiculous.<00:13:38.960> And<00:13:39.120> yet," + }, + { + "start": 819.59, + "duration": 0.0, + "text": "this cuz it's so ridiculous. And yet," + }, + { + "start": 819.6, + "duration": 0.0, + "text": "this cuz it's so ridiculous. 
And yet, that<00:13:39.800> statement<00:13:40.200> has<00:13:40.360> actually<00:13:40.640> been<00:13:40.800> proven" + }, + { + "start": 821.15, + "duration": 0.0, + "text": "that statement has actually been proven" + }, + { + "start": 821.16, + "duration": 0.0, + "text": "that statement has actually been proven right.<00:13:41.480> Every<00:13:41.680> time,<00:13:42.440> you<00:13:42.520> know,<00:13:42.640> people<00:13:42.880> have" + }, + { + "start": 822.99, + "duration": 0.0, + "text": "right. Every time, you know, people have" + }, + { + "start": 823.0, + "duration": 0.0, + "text": "right. Every time, you know, people have encountered<00:13:43.400> stability<00:13:43.839> issues,<00:13:44.480> they<00:13:44.600> say," + }, + { + "start": 824.79, + "duration": 0.0, + "text": "encountered stability issues, they say," + }, + { + "start": 824.8, + "duration": 0.0, + "text": "encountered stability issues, they say, \"Oh,<00:13:44.960> but<00:13:45.120> what<00:13:45.240> if<00:13:45.360> we<00:13:45.440> just<00:13:45.839> throw<00:13:46.040> a<00:13:46.080> layer" + }, + { + "start": 826.31, + "duration": 0.0, + "text": "\"Oh, but what if we just throw a layer" + }, + { + "start": 826.32, + "duration": 0.0, + "text": "\"Oh, but what if we just throw a layer norm<00:13:46.520> into<00:13:46.680> attention?\"<00:13:47.520> Turns<00:13:47.880> out<00:13:48.000> that" + }, + { + "start": 828.19, + "duration": 0.0, + "text": "norm into attention?\" Turns out that" + }, + { + "start": 828.2, + "duration": 0.0, + "text": "norm into attention?\" Turns out that works,<00:13:48.480> too.<00:13:48.800> We'll<00:13:48.920> get<00:13:49.040> to<00:13:49.120> that<00:13:49.280> later.<00:13:49.560> So," + }, + { + "start": 829.67, + "duration": 0.0, + "text": "works, too. We'll get to that later. So," + }, + { + "start": 829.68, + "duration": 0.0, + "text": "works, too. We'll get to that later. 
So, okay,<00:13:49.880> that's<00:13:50.040> post<00:13:50.320> norm" + }, + { + "start": 831.27, + "duration": 0.0, + "text": "okay, that's post norm" + }, + { + "start": 831.28, + "duration": 0.0, + "text": "okay, that's post norm or<00:13:51.400> double<00:13:51.720> norm<00:13:51.960> in<00:13:52.040> this<00:13:52.200> case<00:13:52.440> where<00:13:52.520> you" + }, + { + "start": 832.59, + "duration": 0.0, + "text": "or double norm in this case where you" + }, + { + "start": 832.6, + "duration": 0.0, + "text": "or double norm in this case where you have<00:13:52.839> two<00:13:53.440> layer<00:13:53.680> norms<00:13:53.920> here." + }, + { + "start": 835.07, + "duration": 0.0, + "text": "have two layer norms here." + }, + { + "start": 835.08, + "duration": 0.0, + "text": "have two layer norms here. Okay.<00:13:56.040> The<00:13:56.400> other<00:13:56.680> thing<00:13:57.080> that<00:13:57.280> you<00:13:57.400> can<00:13:57.560> do<00:13:57.880> is" + }, + { + "start": 838.03, + "duration": 0.0, + "text": "Okay. The other thing that you can do is" + }, + { + "start": 838.04, + "duration": 0.0, + "text": "Okay. The other thing that you can do is in<00:13:58.160> the<00:13:58.280> original<00:13:58.760> transformer,<00:13:59.760> you<00:13:59.920> have" + }, + { + "start": 840.11, + "duration": 0.0, + "text": "in the original transformer, you have" + }, + { + "start": 840.12, + "duration": 0.0, + "text": "in the original transformer, you have the<00:14:00.240> layer<00:14:00.520> norm,<00:14:00.839> which<00:14:00.960> is<00:14:01.120> this<00:14:01.400> operation" + }, + { + "start": 841.91, + "duration": 0.0, + "text": "the layer norm, which is this operation" + }, + { + "start": 841.92, + "duration": 0.0, + "text": "the layer norm, which is this operation right<00:14:02.120> here.<00:14:02.440> So,<00:14:02.560> you<00:14:02.680> have<00:14:02.920> your" + }, + { + "start": 843.07, + "duration": 0.0, + "text": "right here. 
So, you have your" + }, + { + "start": 843.08, + "duration": 0.0, + "text": "right here. So, you have your activations<00:14:03.839> X." + }, + { + "start": 844.75, + "duration": 0.0, + "text": "activations X." + }, + { + "start": 844.76, + "duration": 0.0, + "text": "activations X. You're<00:14:04.880> going<00:14:05.000> to<00:14:05.080> mean<00:14:05.320> subtract,<00:14:05.880> divide" + }, + { + "start": 846.11, + "duration": 0.0, + "text": "You're going to mean subtract, divide" + }, + { + "start": 846.12, + "duration": 0.0, + "text": "You're going to mean subtract, divide the<00:14:06.200> variance,<00:14:06.720> and<00:14:06.800> then<00:14:06.920> scale<00:14:07.320> it<00:14:07.440> back<00:14:07.720> up," + }, + { + "start": 847.95, + "duration": 0.0, + "text": "the variance, and then scale it back up," + }, + { + "start": 847.96, + "duration": 0.0, + "text": "the variance, and then scale it back up, right?<00:14:08.760> And<00:14:08.839> this<00:14:09.040> works<00:14:09.240> just<00:14:09.520> fine,<00:14:09.839> right?" + }, + { + "start": 849.95, + "duration": 0.0, + "text": "right? And this works just fine, right?" + }, + { + "start": 849.96, + "duration": 0.0, + "text": "right? And this works just fine, right? It's<00:14:10.080> not<00:14:10.200> like<00:14:10.360> this<00:14:10.480> is<00:14:10.640> wrong.<00:14:11.640> And<00:14:11.800> many" + }, + { + "start": 852.03, + "duration": 0.0, + "text": "It's not like this is wrong. And many" + }, + { + "start": 852.04, + "duration": 0.0, + "text": "It's not like this is wrong. And many models<00:14:12.480> have<00:14:12.640> successfully<00:14:13.160> trained<00:14:13.520> on<00:14:13.640> this" + }, + { + "start": 853.829, + "duration": 0.0, + "text": "models have successfully trained on this" + }, + { + "start": 853.839, + "duration": 0.0, + "text": "models have successfully trained on this scheme." + }, + { + "start": 854.99, + "duration": 0.0, + "text": "scheme." 
+ }, + { + "start": 855.0, + "duration": 0.0, + "text": "scheme. But<00:14:15.160> basically,<00:14:16.120> most<00:14:16.680> or<00:14:16.959> all<00:14:17.320> modern" + }, + { + "start": 857.79, + "duration": 0.0, + "text": "But basically, most or all modern" + }, + { + "start": 857.8, + "duration": 0.0, + "text": "But basically, most or all modern models,<00:14:18.440> I<00:14:18.520> think," + }, + { + "start": 859.63, + "duration": 0.0, + "text": "models, I think," + }, + { + "start": 859.64, + "duration": 0.0, + "text": "models, I think, use<00:14:19.880> RMS<00:14:20.320> norm,<00:14:20.680> which<00:14:21.000> doesn't<00:14:21.320> subtract<00:14:21.760> the" + }, + { + "start": 861.829, + "duration": 0.0, + "text": "use RMS norm, which doesn't subtract the" + }, + { + "start": 861.839, + "duration": 0.0, + "text": "use RMS norm, which doesn't subtract the mean<00:14:22.280> or<00:14:22.520> add<00:14:22.680> a<00:14:22.720> bias<00:14:23.160> term,<00:14:23.480> right?<00:14:23.720> So,<00:14:23.959> it's" + }, + { + "start": 864.23, + "duration": 0.0, + "text": "mean or add a bias term, right? So, it's" + }, + { + "start": 864.24, + "duration": 0.0, + "text": "mean or add a bias term, right? So, it's just<00:14:24.640> a<00:14:24.680> scaling<00:14:25.000> down<00:14:25.560> and<00:14:25.720> scaling<00:14:26.120> back<00:14:26.440> up," + }, + { + "start": 866.71, + "duration": 0.0, + "text": "just a scaling down and scaling back up," + }, + { + "start": 866.72, + "duration": 0.0, + "text": "just a scaling down and scaling back up, right?<00:14:26.839> So,<00:14:26.920> you<00:14:26.959> can<00:14:27.079> see<00:14:27.200> this<00:14:27.360> in<00:14:27.440> the" + }, + { + "start": 867.51, + "duration": 0.0, + "text": "right? So, you can see this in the" + }, + { + "start": 867.52, + "duration": 0.0, + "text": "right? So, you can see this in the equation<00:14:27.920> here." + }, + { + "start": 868.75, + "duration": 0.0, + "text": "equation here." 
+ }, + { + "start": 868.76, + "duration": 0.0, + "text": "equation here. Um<00:14:29.720> and<00:14:29.920> really,<00:14:30.440> layer<00:14:30.760> norm<00:14:30.959> is<00:14:31.320> more" + }, + { + "start": 871.59, + "duration": 0.0, + "text": "Um and really, layer norm is more" + }, + { + "start": 871.6, + "duration": 0.0, + "text": "Um and really, layer norm is more expressive<00:14:32.160> than<00:14:32.320> RMS<00:14:32.760> norm.<00:14:33.079> So,<00:14:33.160> there's" + }, + { + "start": 873.35, + "duration": 0.0, + "text": "expressive than RMS norm. So, there's" + }, + { + "start": 873.36, + "duration": 0.0, + "text": "expressive than RMS norm. So, there's really<00:14:33.720> representationally<00:14:34.839> no<00:14:35.079> reason<00:14:36.000> why" + }, + { + "start": 876.15, + "duration": 0.0, + "text": "really representationally no reason why" + }, + { + "start": 876.16, + "duration": 0.0, + "text": "really representationally no reason why you<00:14:36.280> have<00:14:36.600> to<00:14:36.720> use<00:14:36.920> RMS<00:14:37.320> norm." + }, + { + "start": 878.27, + "duration": 0.0, + "text": "you have to use RMS norm." + }, + { + "start": 878.28, + "duration": 0.0, + "text": "you have to use RMS norm. But<00:14:39.040> RMS<00:14:39.440> norm<00:14:39.640> is<00:14:39.760> nice<00:14:40.120> because<00:14:40.560> in" + }, + { + "start": 880.71, + "duration": 0.0, + "text": "But RMS norm is nice because in" + }, + { + "start": 880.72, + "duration": 0.0, + "text": "But RMS norm is nice because in practice," + }, + { + "start": 882.39, + "duration": 0.0, + "text": "practice," + }, + { + "start": 882.4, + "duration": 0.0, + "text": "practice, there's<00:14:42.640> really<00:14:42.880> no<00:14:43.120> expressiveness<00:14:43.839> loss." + }, + { + "start": 884.43, + "duration": 0.0, + "text": "there's really no expressiveness loss." + }, + { + "start": 884.44, + "duration": 0.0, + "text": "there's really no expressiveness loss. 
RMS<00:14:44.839> norm<00:14:45.079> models<00:14:45.600> just<00:14:45.880> as<00:14:46.040> well<00:14:46.240> as<00:14:46.400> layer" + }, + { + "start": 886.63, + "duration": 0.0, + "text": "RMS norm models just as well as layer" + }, + { + "start": 886.64, + "duration": 0.0, + "text": "RMS norm models just as well as layer norm.<00:14:47.400> But<00:14:47.520> more<00:14:47.680> importantly,<00:14:48.920> it<00:14:49.079> is,<00:14:49.440> you" + }, + { + "start": 889.51, + "duration": 0.0, + "text": "norm. But more importantly, it is, you" + }, + { + "start": 889.52, + "duration": 0.0, + "text": "norm. But more importantly, it is, you know,<00:14:49.720> faster,<00:14:50.400> right?<00:14:50.600> This<00:14:50.720> is<00:14:50.839> the<00:14:50.959> part" + }, + { + "start": 891.23, + "duration": 0.0, + "text": "know, faster, right? This is the part" + }, + { + "start": 891.24, + "duration": 0.0, + "text": "know, faster, right? This is the part where<00:14:51.400> kind<00:14:51.600> of<00:14:51.640> the<00:14:51.760> systems<00:14:52.800> and<00:14:52.959> sort<00:14:53.160> of" + }, + { + "start": 893.39, + "duration": 0.0, + "text": "where kind of the systems and sort of" + }, + { + "start": 893.4, + "duration": 0.0, + "text": "where kind of the systems and sort of architecture<00:14:53.959> co-design<00:14:54.600> starts<00:14:54.920> to<00:14:55.000> come" + }, + { + "start": 895.19, + "duration": 0.0, + "text": "architecture co-design starts to come" + }, + { + "start": 895.2, + "duration": 0.0, + "text": "architecture co-design starts to come in." + }, + { + "start": 896.11, + "duration": 0.0, + "text": "in." + }, + { + "start": 896.12, + "duration": 0.0, + "text": "in. 
Percy<00:14:56.600> mentioned,<00:14:57.280> you<00:14:57.400> know,<00:14:57.520> in<00:14:57.600> the" + }, + { + "start": 897.71, + "duration": 0.0, + "text": "Percy mentioned, you know, in the" + }, + { + "start": 897.72, + "duration": 0.0, + "text": "Percy mentioned, you know, in the previous<00:14:58.160> lecture,<00:14:58.440> this<00:14:58.600> idea<00:14:58.880> of" + }, + { + "start": 899.03, + "duration": 0.0, + "text": "previous lecture, this idea of" + }, + { + "start": 899.04, + "duration": 0.0, + "text": "previous lecture, this idea of arithmetic<00:14:59.480> intensity,<00:15:00.160> right?<00:15:00.360> We<00:15:00.480> want<00:15:00.640> to" + }, + { + "start": 900.71, + "duration": 0.0, + "text": "arithmetic intensity, right? We want to" + }, + { + "start": 900.72, + "duration": 0.0, + "text": "arithmetic intensity, right? We want to keep<00:15:00.959> our<00:15:01.040> GPUs<00:15:01.560> hot<00:15:02.360> by<00:15:02.520> doing,<00:15:02.920> you<00:15:03.000> know," + }, + { + "start": 903.069, + "duration": 0.0, + "text": "keep our GPUs hot by doing, you know," + }, + { + "start": 903.079, + "duration": 0.0, + "text": "keep our GPUs hot by doing, you know, matrix<00:15:03.480> multiplies<00:15:04.079> and<00:15:04.200> other<00:15:04.360> very<00:15:04.640> intense" + }, + { + "start": 905.069, + "duration": 0.0, + "text": "matrix multiplies and other very intense" + }, + { + "start": 905.079, + "duration": 0.0, + "text": "matrix multiplies and other very intense computations.<00:15:06.320> We<00:15:06.480> do<00:15:06.600> not<00:15:06.839> want<00:15:06.959> to<00:15:07.040> be" + }, + { + "start": 907.15, + "duration": 0.0, + "text": "computations. We do not want to be" + }, + { + "start": 907.16, + "duration": 0.0, + "text": "computations. 
We do not want to be wasting<00:15:07.560> our<00:15:07.680> GPUs<00:15:08.160> by<00:15:08.320> having<00:15:08.680> them<00:15:08.880> move" + }, + { + "start": 909.35, + "duration": 0.0, + "text": "wasting our GPUs by having them move" + }, + { + "start": 909.36, + "duration": 0.0, + "text": "wasting our GPUs by having them move little<00:15:09.640> tiny<00:15:10.040> bits<00:15:10.240> of<00:15:10.360> memory<00:15:10.760> back<00:15:11.000> and" + }, + { + "start": 911.15, + "duration": 0.0, + "text": "little tiny bits of memory back and" + }, + { + "start": 911.16, + "duration": 0.0, + "text": "little tiny bits of memory back and forth,<00:15:11.560> right?<00:15:11.720> That's<00:15:11.839> a<00:15:11.880> very<00:15:12.160> inefficient" + }, + { + "start": 912.75, + "duration": 0.0, + "text": "forth, right? That's a very inefficient" + }, + { + "start": 912.76, + "duration": 0.0, + "text": "forth, right? That's a very inefficient use<00:15:13.440> of<00:15:13.640> our,<00:15:13.920> you<00:15:14.000> know,<00:15:14.160> very<00:15:14.400> powerful<00:15:14.800> GPU." + }, + { + "start": 916.03, + "duration": 0.0, + "text": "use of our, you know, very powerful GPU." + }, + { + "start": 916.04, + "duration": 0.0, + "text": "use of our, you know, very powerful GPU. 
And<00:15:16.280> so,<00:15:16.400> what<00:15:16.520> we<00:15:16.680> want<00:15:17.440> is<00:15:17.640> to<00:15:17.800> remove" + }, + { + "start": 918.39, + "duration": 0.0, + "text": "And so, what we want is to remove" + }, + { + "start": 918.4, + "duration": 0.0, + "text": "And so, what we want is to remove operations<00:15:19.280> that<00:15:19.400> are<00:15:19.520> small<00:15:20.000> and<00:15:20.120> involve" + }, + { + "start": 920.51, + "duration": 0.0, + "text": "operations that are small and involve" + }, + { + "start": 920.52, + "duration": 0.0, + "text": "operations that are small and involve memory<00:15:20.839> movement,<00:15:21.920> but<00:15:22.320> don't<00:15:22.560> give<00:15:22.720> us<00:15:22.839> much" + }, + { + "start": 922.949, + "duration": 0.0, + "text": "memory movement, but don't give us much" + }, + { + "start": 922.959, + "duration": 0.0, + "text": "memory movement, but don't give us much expressive<00:15:23.480> power,<00:15:23.959> right?<00:15:24.120> So,<00:15:24.280> by<00:15:24.480> that" + }, + { + "start": 924.91, + "duration": 0.0, + "text": "expressive power, right? So, by that" + }, + { + "start": 924.92, + "duration": 0.0, + "text": "expressive power, right? 
So, by that view,<00:15:25.640> what<00:15:25.760> we<00:15:25.880> really<00:15:26.200> want<00:15:26.360> to<00:15:26.440> be<00:15:26.560> doing" + }, + { + "start": 926.91, + "duration": 0.0, + "text": "view, what we really want to be doing" + }, + { + "start": 926.92, + "duration": 0.0, + "text": "view, what we really want to be doing here<00:15:27.520> is,<00:15:27.839> you<00:15:27.880> know,<00:15:28.000> if<00:15:28.120> the<00:15:28.200> mean" + }, + { + "start": 928.47, + "duration": 0.0, + "text": "here is, you know, if the mean" + }, + { + "start": 928.48, + "duration": 0.0, + "text": "here is, you know, if the mean subtraction<00:15:29.560> and<00:15:29.680> addition<00:15:30.240> isn't<00:15:30.520> really" + }, + { + "start": 930.71, + "duration": 0.0, + "text": "subtraction and addition isn't really" + }, + { + "start": 930.72, + "duration": 0.0, + "text": "subtraction and addition isn't really doing<00:15:31.040> much<00:15:31.280> for<00:15:31.480> us,<00:15:31.839> just<00:15:32.000> get<00:15:32.160> rid<00:15:32.320> of<00:15:32.400> it," + }, + { + "start": 932.55, + "duration": 0.0, + "text": "doing much for us, just get rid of it," + }, + { + "start": 932.56, + "duration": 0.0, + "text": "doing much for us, just get rid of it, right?<00:15:33.280> Um" + }, + { + "start": 934.39, + "duration": 0.0, + "text": "right? Um" + }, + { + "start": 934.4, + "duration": 0.0, + "text": "right? Um you<00:15:34.520> might<00:15:34.720> think,<00:15:35.280> \"Okay,<00:15:36.160> why<00:15:36.360> does<00:15:36.520> this" + }, + { + "start": 936.67, + "duration": 0.0, + "text": "you might think, \"Okay, why does this" + }, + { + "start": 936.68, + "duration": 0.0, + "text": "you might think, \"Okay, why does this matter?<00:15:37.079> We're<00:15:37.200> just<00:15:37.440> optimizing<00:15:37.959> this" + }, + { + "start": 938.19, + "duration": 0.0, + "text": "matter? 
We're just optimizing this" + }, + { + "start": 938.2, + "duration": 0.0, + "text": "matter? We're just optimizing this teeny,<00:15:38.600> tiny<00:15:39.000> operation<00:15:40.000> that<00:15:40.240> accounts<00:15:40.720> for," + }, + { + "start": 941.43, + "duration": 0.0, + "text": "teeny, tiny operation that accounts for," + }, + { + "start": 941.44, + "duration": 0.0, + "text": "teeny, tiny operation that accounts for, you<00:15:41.560> know,<00:15:41.760> in<00:15:41.880> this<00:15:42.079> case,<00:15:42.640> something<00:15:42.959> like" + }, + { + "start": 943.19, + "duration": 0.0, + "text": "you know, in this case, something like" + }, + { + "start": 943.2, + "duration": 0.0, + "text": "you know, in this case, something like 0.17%" + }, + { + "start": 945.069, + "duration": 0.0, + "text": "0.17%" + }, + { + "start": 945.079, + "duration": 0.0, + "text": "0.17% of<00:15:45.320> the<00:15:45.400> total<00:15:45.800> floating<00:15:46.120> point<00:15:46.400> operations" + }, + { + "start": 947.51, + "duration": 0.0, + "text": "of the total floating point operations" + }, + { + "start": 947.52, + "duration": 0.0, + "text": "of the total floating point operations of<00:15:47.800> our<00:15:47.920> system.\"" + }, + { + "start": 949.11, + "duration": 0.0, + "text": "of our system.\"" + }, + { + "start": 949.12, + "duration": 0.0, + "text": "of our system.\" But,<00:15:49.839> you<00:15:49.920> know,<00:15:50.160> as<00:15:50.440> Percy<00:15:51.240> mentioned,<00:15:51.720> it's" + }, + { + "start": 951.87, + "duration": 0.0, + "text": "But, you know, as Percy mentioned, it's" + }, + { + "start": 951.88, + "duration": 0.0, + "text": "But, you know, as Percy mentioned, it's not<00:15:52.079> really<00:15:52.280> about<00:15:52.520> the<00:15:52.640> flops,<00:15:53.280> right?<00:15:53.440> The" + }, + { + "start": 953.55, + "duration": 0.0, + "text": "not really about the flops, right? 
The" + }, + { + "start": 953.56, + "duration": 0.0, + "text": "not really about the flops, right? The flops<00:15:53.959> are<00:15:54.040> the<00:15:54.400> the<00:15:54.520> floating<00:15:54.959> point" + }, + { + "start": 955.15, + "duration": 0.0, + "text": "flops are the the floating point" + }, + { + "start": 955.16, + "duration": 0.0, + "text": "flops are the the floating point operations<00:15:55.760> we<00:15:55.880> do,<00:15:56.079> that's<00:15:56.280> sort<00:15:56.400> of" + }, + { + "start": 956.51, + "duration": 0.0, + "text": "operations we do, that's sort of" + }, + { + "start": 956.52, + "duration": 0.0, + "text": "operations we do, that's sort of multiplying<00:15:57.040> matrices,<00:15:58.040> but<00:15:58.160> that's<00:15:58.320> not" + }, + { + "start": 958.55, + "duration": 0.0, + "text": "multiplying matrices, but that's not" + }, + { + "start": 958.56, + "duration": 0.0, + "text": "multiplying matrices, but that's not runtime,<00:15:59.120> right?<00:15:59.360> Runtime<00:16:00.320> is<00:16:00.600> a<00:16:00.680> much<00:16:00.880> more" + }, + { + "start": 961.03, + "duration": 0.0, + "text": "runtime, right? Runtime is a much more" + }, + { + "start": 961.04, + "duration": 0.0, + "text": "runtime, right? Runtime is a much more complicated<00:16:01.680> object.<00:16:02.640> And,<00:16:03.440> you<00:16:03.520> know," + }, + { + "start": 963.63, + "duration": 0.0, + "text": "complicated object. And, you know," + }, + { + "start": 963.64, + "duration": 0.0, + "text": "complicated object. 
And, you know, statistical<00:16:04.240> normalizations,<00:16:05.160> things<00:16:05.440> like" + }, + { + "start": 965.59, + "duration": 0.0, + "text": "statistical normalizations, things like" + }, + { + "start": 965.6, + "duration": 0.0, + "text": "statistical normalizations, things like layer<00:16:05.880> norms,<00:16:06.600> even<00:16:06.839> though<00:16:06.959> they're<00:16:07.079> only" + }, + { + "start": 967.23, + "duration": 0.0, + "text": "layer norms, even though they're only" + }, + { + "start": 967.24, + "duration": 0.0, + "text": "layer norms, even though they're only 0.17%<00:16:08.200> of<00:16:08.280> the<00:16:08.360> flops,<00:16:09.240> depending<00:16:09.800> on<00:16:09.920> your" + }, + { + "start": 970.03, + "duration": 0.0, + "text": "0.17% of the flops, depending on your" + }, + { + "start": 970.04, + "duration": 0.0, + "text": "0.17% of the flops, depending on your workload<00:16:10.440> and<00:16:10.520> depending<00:16:10.959> on<00:16:11.079> the<00:16:11.200> setup,<00:16:11.760> can" + }, + { + "start": 971.87, + "duration": 0.0, + "text": "workload and depending on the setup, can" + }, + { + "start": 971.88, + "duration": 0.0, + "text": "workload and depending on the setup, can be<00:16:12.079> up<00:16:12.240> to<00:16:12.360> 25%<00:16:13.240> of<00:16:13.280> the<00:16:13.400> runtime,<00:16:13.839> right?" + }, + { + "start": 973.949, + "duration": 0.0, + "text": "be up to 25% of the runtime, right?" + }, + { + "start": 973.959, + "duration": 0.0, + "text": "be up to 25% of the runtime, right? That's<00:16:14.120> kind<00:16:14.400> of<00:16:14.520> crazy.<00:16:14.920> On<00:16:15.079> tiny<00:16:15.440> models," + }, + { + "start": 975.75, + "duration": 0.0, + "text": "That's kind of crazy. On tiny models," + }, + { + "start": 975.76, + "duration": 0.0, + "text": "That's kind of crazy. 
On tiny models, this<00:16:15.880> can<00:16:16.000> be<00:16:16.120> really,<00:16:16.360> really<00:16:16.600> big<00:16:17.400> because" + }, + { + "start": 977.75, + "duration": 0.0, + "text": "this can be really, really big because" + }, + { + "start": 977.76, + "duration": 0.0, + "text": "this can be really, really big because you're<00:16:17.880> still<00:16:18.160> having<00:16:18.480> to<00:16:18.600> move<00:16:19.040> all<00:16:19.160> these" + }, + { + "start": 979.35, + "duration": 0.0, + "text": "you're still having to move all these" + }, + { + "start": 979.36, + "duration": 0.0, + "text": "you're still having to move all these parameters<00:16:19.839> back<00:16:20.079> and<00:16:20.200> forth<00:16:20.560> from<00:16:20.760> fast<00:16:21.160> to" + }, + { + "start": 981.23, + "duration": 0.0, + "text": "parameters back and forth from fast to" + }, + { + "start": 981.24, + "duration": 0.0, + "text": "parameters back and forth from fast to slow<00:16:21.520> memory<00:16:21.880> and<00:16:22.000> vice<00:16:22.240> versa" + }, + { + "start": 983.35, + "duration": 0.0, + "text": "slow memory and vice versa" + }, + { + "start": 983.36, + "duration": 0.0, + "text": "slow memory and vice versa when<00:16:23.480> you're<00:16:23.600> doing<00:16:23.839> these<00:16:24.040> operations.<00:16:24.680> So," + }, + { + "start": 985.19, + "duration": 0.0, + "text": "when you're doing these operations. So," + }, + { + "start": 985.2, + "duration": 0.0, + "text": "when you're doing these operations. 
So, data<00:16:25.480> movement<00:16:25.839> is<00:16:26.000> really,<00:16:26.240> really" + }, + { + "start": 986.51, + "duration": 0.0, + "text": "data movement is really, really" + }, + { + "start": 986.52, + "duration": 0.0, + "text": "data movement is really, really important,<00:16:27.680> and<00:16:27.839> RMS<00:16:28.240> norm<00:16:28.480> can<00:16:28.640> still<00:16:28.959> matter" + }, + { + "start": 989.27, + "duration": 0.0, + "text": "important, and RMS norm can still matter" + }, + { + "start": 989.28, + "duration": 0.0, + "text": "important, and RMS norm can still matter a<00:16:29.320> lot<00:16:29.680> because<00:16:30.160> of<00:16:30.280> this,<00:16:30.680> right?<00:16:31.000> So," + }, + { + "start": 992.069, + "duration": 0.0, + "text": "a lot because of this, right? So," + }, + { + "start": 992.079, + "duration": 0.0, + "text": "a lot because of this, right? So, um" + }, + { + "start": 992.63, + "duration": 0.0, + "text": "um" + }, + { + "start": 992.64, + "duration": 0.0, + "text": "um you<00:16:32.839> can<00:16:32.959> see<00:16:33.079> kind<00:16:33.240> of<00:16:33.320> the<00:16:33.400> difference<00:16:33.720> here." + }, + { + "start": 994.949, + "duration": 0.0, + "text": "you can see kind of the difference here." + }, + { + "start": 994.959, + "duration": 0.0, + "text": "you can see kind of the difference here. 
The<00:16:35.160> arithmetic<00:16:35.600> intensity<00:16:36.240> is<00:16:36.440> in<00:16:36.680> white," + }, + { + "start": 997.31, + "duration": 0.0, + "text": "The arithmetic intensity is in white," + }, + { + "start": 997.32, + "duration": 0.0, + "text": "The arithmetic intensity is in white, and<00:16:37.440> then<00:16:37.560> you<00:16:37.640> can<00:16:37.760> kind<00:16:38.000> of<00:16:38.079> see<00:16:38.600> the<00:16:38.720> flops" + }, + { + "start": 999.19, + "duration": 0.0, + "text": "and then you can kind of see the flops" + }, + { + "start": 999.2, + "duration": 0.0, + "text": "and then you can kind of see the flops involved<00:16:39.720> in<00:16:39.880> black.<00:16:40.280> And<00:16:40.400> you<00:16:40.480> see<00:16:40.600> that" + }, + { + "start": 1000.79, + "duration": 0.0, + "text": "involved in black. And you see that" + }, + { + "start": 1000.8, + "duration": 0.0, + "text": "involved in black. And you see that layer<00:16:41.079> norm<00:16:41.400> has<00:16:41.560> a<00:16:41.640> very<00:16:42.079> low<00:16:42.240> arithmetic" + }, + { + "start": 1002.67, + "duration": 0.0, + "text": "layer norm has a very low arithmetic" + }, + { + "start": 1002.68, + "duration": 0.0, + "text": "layer norm has a very low arithmetic intensity,<00:16:43.560> which<00:16:43.720> is<00:16:43.800> the<00:16:43.920> operation<00:16:44.360> we<00:16:44.440> try" + }, + { + "start": 1004.59, + "duration": 0.0, + "text": "intensity, which is the operation we try" + }, + { + "start": 1004.6, + "duration": 0.0, + "text": "intensity, which is the operation we try to<00:16:44.720> want<00:16:44.880> to<00:16:45.079> remove<00:16:45.640> as<00:16:45.760> much<00:16:45.920> as<00:16:46.040> possible." + }, + { + "start": 1006.39, + "duration": 0.0, + "text": "to want to remove as much as possible." + }, + { + "start": 1006.4, + "duration": 0.0, + "text": "to want to remove as much as possible. 
Yeah,<00:16:46.560> question<00:16:46.959> over<00:16:47.120> there." + }, + { + "start": 1008.47, + "duration": 0.0, + "text": "Yeah, question over there." + }, + { + "start": 1008.48, + "duration": 0.0, + "text": "Yeah, question over there. Data<00:16:48.760> movement<00:16:49.280> for<00:16:49.800> normalization<00:16:50.640> is<00:16:50.720> so" + }, + { + "start": 1010.91, + "duration": 0.0, + "text": "Data movement for normalization is so" + }, + { + "start": 1010.92, + "duration": 0.0, + "text": "Data movement for normalization is so disproportionate<00:16:51.720> compared<00:16:52.120> to<00:16:52.240> arithmetic" + }, + { + "start": 1012.71, + "duration": 0.0, + "text": "disproportionate compared to arithmetic" + }, + { + "start": 1012.72, + "duration": 0.0, + "text": "disproportionate compared to arithmetic contraction" + }, + { + "start": 1014.51, + "duration": 0.0, + "text": "contraction" + }, + { + "start": 1014.52, + "duration": 0.0, + "text": "contraction So,<00:16:55.280> for<00:16:55.440> a<00:16:55.480> ten-<00:16:55.760> Something<00:16:56.000> like<00:16:56.120> tensor" + }, + { + "start": 1016.43, + "duration": 0.0, + "text": "So, for a ten- Something like tensor" + }, + { + "start": 1016.44, + "duration": 0.0, + "text": "So, for a ten- Something like tensor contraction,<00:16:57.280> which<00:16:57.440> is,<00:16:57.640> in<00:16:57.720> this<00:16:57.839> case," + }, + { + "start": 1018.23, + "duration": 0.0, + "text": "contraction, which is, in this case," + }, + { + "start": 1018.24, + "duration": 0.0, + "text": "contraction, which is, in this case, matrix<00:16:58.560> multiplies," + }, + { + "start": 1019.91, + "duration": 0.0, + "text": "matrix multiplies," + }, + { + "start": 1019.92, + "duration": 0.0, + "text": "matrix multiplies, the<00:17:00.040> majority<00:17:00.640> of<00:17:00.720> the<00:17:00.800> workload<00:17:01.240> is,<00:17:01.920> you" + }, + { + "start": 1021.99, + "duration": 0.0, + "text": "the majority of the 
workload is, you" + }, + { + "start": 1022.0, + "duration": 0.0, + "text": "the majority of the workload is, you know,<00:17:02.360> multiplying.<00:17:03.320> Whereas<00:17:03.720> for<00:17:04.199> stat" + }, + { + "start": 1024.51, + "duration": 0.0, + "text": "know, multiplying. Whereas for stat" + }, + { + "start": 1024.52, + "duration": 0.0, + "text": "know, multiplying. Whereas for stat normalization,<00:17:05.079> the<00:17:05.160> majority<00:17:05.560> of<00:17:05.600> the" + }, + { + "start": 1025.669, + "duration": 0.0, + "text": "normalization, the majority of the" + }, + { + "start": 1025.679, + "duration": 0.0, + "text": "normalization, the majority of the workload<00:17:06.079> is<00:17:06.240> memory<00:17:06.600> movement.<00:17:07.079> And<00:17:07.160> memory" + }, + { + "start": 1027.429, + "duration": 0.0, + "text": "workload is memory movement. And memory" + }, + { + "start": 1027.439, + "duration": 0.0, + "text": "workload is memory movement. And memory movement<00:17:07.760> is<00:17:07.880> quite<00:17:08.120> slow.<00:17:08.520> So,<00:17:08.640> imagine<00:17:09.000> the" + }, + { + "start": 1029.069, + "duration": 0.0, + "text": "movement is quite slow. So, imagine the" + }, + { + "start": 1029.079, + "duration": 0.0, + "text": "movement is quite slow. 
So, imagine the case<00:17:09.320> where<00:17:09.439> like<00:17:09.880> moving<00:17:10.280> something<00:17:10.640> is<00:17:10.800> like" + }, + { + "start": 1031.51, + "duration": 0.0, + "text": "case where like moving something is like" + }, + { + "start": 1031.52, + "duration": 0.0, + "text": "case where like moving something is like almost<00:17:11.880> all<00:17:12.079> of<00:17:12.120> the<00:17:12.199> compute,<00:17:13.040> then<00:17:13.160> you're" + }, + { + "start": 1033.27, + "duration": 0.0, + "text": "almost all of the compute, then you're" + }, + { + "start": 1033.28, + "duration": 0.0, + "text": "almost all of the compute, then you're still<00:17:13.520> paying<00:17:13.720> quite<00:17:13.959> a<00:17:14.040> bit<00:17:14.280> here,<00:17:14.439> right?" + }, + { + "start": 1034.59, + "duration": 0.0, + "text": "still paying quite a bit here, right?" + }, + { + "start": 1034.6, + "duration": 0.0, + "text": "still paying quite a bit here, right? Cuz<00:17:14.720> activations<00:17:15.320> can<00:17:15.439> be<00:17:15.520> quite<00:17:15.760> large." + }, + { + "start": 1038.36, + "duration": 0.0, + "text": "Yeah,<00:17:18.560> I<00:17:18.600> think<00:17:18.800> the<00:17:18.880> percent<00:17:19.280> runtime<00:17:19.640> in" + }, + { + "start": 1039.71, + "duration": 0.0, + "text": "Yeah, I think the percent runtime in" + }, + { + "start": 1039.72, + "duration": 0.0, + "text": "Yeah, I think the percent runtime in this<00:17:19.880> case<00:17:20.079> is<00:17:20.199> quite<00:17:20.400> extreme.<00:17:21.079> This<00:17:21.240> is<00:17:21.320> like" + }, + { + "start": 1041.47, + "duration": 0.0, + "text": "this case is quite extreme. This is like" + }, + { + "start": 1041.48, + "duration": 0.0, + "text": "this case is quite extreme. 
This is like in<00:17:21.600> the,<00:17:22.079> you<00:17:22.160> know,<00:17:22.240> tiny<00:17:22.679> models<00:17:23.199> with<00:17:23.360> like" + }, + { + "start": 1043.51, + "duration": 0.0, + "text": "in the, you know, tiny models with like" + }, + { + "start": 1043.52, + "duration": 0.0, + "text": "in the, you know, tiny models with like matrices<00:17:24.000> that<00:17:24.160> don't<00:17:24.360> really<00:17:24.640> generally" + }, + { + "start": 1045.03, + "duration": 0.0, + "text": "matrices that don't really generally" + }, + { + "start": 1045.04, + "duration": 0.0, + "text": "matrices that don't really generally make<00:17:25.280> sense<00:17:25.520> in<00:17:25.600> modern<00:17:25.920> workloads,<00:17:26.400> but<00:17:26.520> this" + }, + { + "start": 1046.669, + "duration": 0.0, + "text": "make sense in modern workloads, but this" + }, + { + "start": 1046.679, + "duration": 0.0, + "text": "make sense in modern workloads, but this is,<00:17:27.120> you<00:17:27.199> know,<00:17:27.360> giving<00:17:27.600> you<00:17:27.679> a<00:17:27.760> sense<00:17:28.040> of<00:17:28.280> why" + }, + { + "start": 1048.59, + "duration": 0.0, + "text": "is, you know, giving you a sense of why" + }, + { + "start": 1048.6, + "duration": 0.0, + "text": "is, you know, giving you a sense of why this<00:17:28.800> is<00:17:28.920> a<00:17:28.960> free<00:17:29.280> optimization." + }, + { + "start": 1050.99, + "duration": 0.0, + "text": "this is a free optimization." + }, + { + "start": 1051.0, + "duration": 0.0, + "text": "this is a free optimization. Um<00:17:31.679> and<00:17:31.800> you<00:17:31.880> do<00:17:32.040> see<00:17:32.240> this,<00:17:32.520> right?<00:17:32.679> This<00:17:32.840> is" + }, + { + "start": 1052.91, + "duration": 0.0, + "text": "Um and you do see this, right? This is" + }, + { + "start": 1052.92, + "duration": 0.0, + "text": "Um and you do see this, right? 
This is another<00:17:33.320> paper<00:17:33.679> in<00:17:33.800> which<00:17:34.200> people<00:17:34.480> were" + }, + { + "start": 1055.07, + "duration": 0.0, + "text": "another paper in which people were" + }, + { + "start": 1055.08, + "duration": 0.0, + "text": "another paper in which people were evaluating<00:17:35.600> different<00:17:35.920> architecture" + }, + { + "start": 1056.39, + "duration": 0.0, + "text": "evaluating different architecture" + }, + { + "start": 1056.4, + "duration": 0.0, + "text": "evaluating different architecture interventions.<00:17:37.000> They're<00:17:37.120> on<00:17:37.280> Get<00:17:37.400> All<00:17:37.520> in" + }, + { + "start": 1057.59, + "duration": 0.0, + "text": "interventions. They're on Get All in" + }, + { + "start": 1057.6, + "duration": 0.0, + "text": "interventions. They're on Get All in 2020." + }, + { + "start": 1058.91, + "duration": 0.0, + "text": "2020." + }, + { + "start": 1058.92, + "duration": 0.0, + "text": "2020. I<00:17:39.000> think<00:17:39.160> this<00:17:39.280> was<00:17:39.400> a<00:17:39.440> Google<00:17:39.760> paper,<00:17:40.160> and" + }, + { + "start": 1060.27, + "duration": 0.0, + "text": "I think this was a Google paper, and" + }, + { + "start": 1060.28, + "duration": 0.0, + "text": "I think this was a Google paper, and they<00:17:40.400> show,<00:17:40.640> you<00:17:40.720> know,<00:17:40.800> for<00:17:41.040> teeny,<00:17:41.320> tiny" + }, + { + "start": 1061.55, + "duration": 0.0, + "text": "they show, you know, for teeny, tiny" + }, + { + "start": 1061.56, + "duration": 0.0, + "text": "they show, you know, for teeny, tiny transformer<00:17:42.080> of<00:17:42.200> a<00:17:42.240> 200<00:17:42.679> million<00:17:42.920> parameters," + }, + { + "start": 1063.75, + "duration": 0.0, + "text": "transformer of a 200 million parameters," + }, + { + "start": 1063.76, + "duration": 0.0, + "text": "transformer of a 200 million parameters, you<00:17:43.880> got<00:17:44.040> 
more<00:17:44.280> steps<00:17:44.600> per<00:17:44.720> second.<00:17:45.120> That's" + }, + { + "start": 1065.31, + "duration": 0.0, + "text": "you got more steps per second. That's" + }, + { + "start": 1065.32, + "duration": 0.0, + "text": "you got more steps per second. That's the<00:17:45.400> third<00:17:45.679> column<00:17:45.960> over<00:17:46.120> here<00:17:46.360> when<00:17:46.480> you" + }, + { + "start": 1066.51, + "duration": 0.0, + "text": "the third column over here when you" + }, + { + "start": 1066.52, + "duration": 0.0, + "text": "the third column over here when you switch<00:17:46.760> to<00:17:46.880> RMS<00:17:47.240> norm.<00:17:48.040> And<00:17:48.160> in<00:17:48.240> fact,<00:17:48.520> you" + }, + { + "start": 1068.59, + "duration": 0.0, + "text": "switch to RMS norm. And in fact, you" + }, + { + "start": 1068.6, + "duration": 0.0, + "text": "switch to RMS norm. And in fact, you actually<00:17:48.840> get<00:17:49.040> better<00:17:49.240> performance,<00:17:49.800> which<00:17:49.960> I" + }, + { + "start": 1069.99, + "duration": 0.0, + "text": "actually get better performance, which I" + }, + { + "start": 1070.0, + "duration": 0.0, + "text": "actually get better performance, which I don't<00:17:50.240> think<00:17:50.400> is<00:17:50.800> something<00:17:51.080> that<00:17:51.240> you're" + }, + { + "start": 1071.35, + "duration": 0.0, + "text": "don't think is something that you're" + }, + { + "start": 1071.36, + "duration": 0.0, + "text": "don't think is something that you're guaranteed,<00:17:52.560> but<00:17:52.720> it's<00:17:52.840> a<00:17:52.880> nice<00:17:53.120> bonus" + }, + { + "start": 1073.47, + "duration": 0.0, + "text": "guaranteed, but it's a nice bonus" + }, + { + "start": 1073.48, + "duration": 0.0, + "text": "guaranteed, but it's a nice bonus regardless,<00:17:54.120> right?<00:17:54.440> So,<00:17:54.920> you<00:17:55.000> got<00:17:55.120> a<00:17:55.160> free" + }, + { + "start": 1075.35, + "duration": 0.0, + 
"text": "regardless, right? So, you got a free" + }, + { + "start": 1075.36, + "duration": 0.0, + "text": "regardless, right? So, you got a free systems<00:17:55.840> win<00:17:56.679> by<00:17:56.760> just<00:17:57.000> moving<00:17:57.280> to<00:17:57.360> RMS<00:17:57.720> norm." + }, + { + "start": 1077.91, + "duration": 0.0, + "text": "systems win by just moving to RMS norm." + }, + { + "start": 1077.92, + "duration": 0.0, + "text": "systems win by just moving to RMS norm. And<00:17:58.000> so,<00:17:58.120> basically,<00:17:58.480> everyone<00:17:58.800> has<00:17:59.040> has" + }, + { + "start": 1079.23, + "duration": 0.0, + "text": "And so, basically, everyone has has" + }, + { + "start": 1079.24, + "duration": 0.0, + "text": "And so, basically, everyone has has decided<00:17:59.800> to<00:17:59.880> move<00:18:00.120> over<00:18:00.760> to<00:18:00.800> this<00:18:01.040> now." + }, + { + "start": 1082.59, + "duration": 0.0, + "text": "decided to move over to this now." + }, + { + "start": 1082.6, + "duration": 0.0, + "text": "decided to move over to this now. And<00:18:02.840> in<00:18:03.000> general," + }, + { + "start": 1084.47, + "duration": 0.0, + "text": "And in general," + }, + { + "start": 1084.48, + "duration": 0.0, + "text": "And in general, there's<00:18:04.720> a<00:18:04.760> more<00:18:05.360> general<00:18:05.800> version<00:18:06.160> of<00:18:06.280> this." + }, + { + "start": 1086.55, + "duration": 0.0, + "text": "there's a more general version of this." + }, + { + "start": 1086.56, + "duration": 0.0, + "text": "there's a more general version of this. 
There's<00:18:06.720> no<00:18:06.840> guarantee<00:18:07.400> to<00:18:07.480> any<00:18:07.640> of<00:18:07.720> the" + }, + { + "start": 1087.79, + "duration": 0.0, + "text": "There's no guarantee to any of the" + }, + { + "start": 1087.8, + "duration": 0.0, + "text": "There's no guarantee to any of the things<00:18:08.040> I'm<00:18:08.160> saying,<00:18:08.560> but<00:18:09.080> bias<00:18:09.560> terms<00:18:10.480> in" + }, + { + "start": 1090.79, + "duration": 0.0, + "text": "things I'm saying, but bias terms in" + }, + { + "start": 1090.8, + "duration": 0.0, + "text": "things I'm saying, but bias terms in transformers<00:18:11.400> and<00:18:11.520> neural<00:18:11.760> networks" + }, + { + "start": 1092.87, + "duration": 0.0, + "text": "transformers and neural networks" + }, + { + "start": 1092.88, + "duration": 0.0, + "text": "transformers and neural networks are<00:18:13.040> generally<00:18:13.480> not<00:18:14.040> that<00:18:14.360> useful.<00:18:14.840> So,<00:18:14.920> in" + }, + { + "start": 1094.95, + "duration": 0.0, + "text": "are generally not that useful. So, in" + }, + { + "start": 1094.96, + "duration": 0.0, + "text": "are generally not that useful. 
So, in the<00:18:15.040> original<00:18:15.360> transformer,<00:18:15.920> the<00:18:16.080> linear" + }, + { + "start": 1096.43, + "duration": 0.0, + "text": "the original transformer, the linear" + }, + { + "start": 1096.44, + "duration": 0.0, + "text": "the original transformer, the linear terms<00:18:17.120> all<00:18:17.320> have<00:18:17.520> biases," + }, + { + "start": 1098.99, + "duration": 0.0, + "text": "terms all have biases," + }, + { + "start": 1099.0, + "duration": 0.0, + "text": "terms all have biases, but<00:18:19.240> most<00:18:19.640> implementations<00:18:20.880> actually<00:18:21.120> just" + }, + { + "start": 1101.27, + "duration": 0.0, + "text": "but most implementations actually just" + }, + { + "start": 1101.28, + "duration": 0.0, + "text": "but most implementations actually just drop<00:18:21.640> the<00:18:21.720> biases<00:18:22.200> entirely,<00:18:22.800> right?<00:18:23.000> Once" + }, + { + "start": 1103.19, + "duration": 0.0, + "text": "drop the biases entirely, right? Once" + }, + { + "start": 1103.2, + "duration": 0.0, + "text": "drop the biases entirely, right? 
Once again,<00:18:23.440> this<00:18:23.600> is<00:18:23.679> another<00:18:24.000> example<00:18:24.880> of" + }, + { + "start": 1105.07, + "duration": 0.0, + "text": "again, this is another example of" + }, + { + "start": 1105.08, + "duration": 0.0, + "text": "again, this is another example of something<00:18:25.400> that's<00:18:25.600> not<00:18:25.840> very<00:18:26.120> arithmetically" + }, + { + "start": 1106.71, + "duration": 0.0, + "text": "something that's not very arithmetically" + }, + { + "start": 1106.72, + "duration": 0.0, + "text": "something that's not very arithmetically intense,<00:18:27.120> but<00:18:27.320> fairly<00:18:27.679> memory" + }, + { + "start": 1108.87, + "duration": 0.0, + "text": "intense, but fairly memory" + }, + { + "start": 1108.88, + "duration": 0.0, + "text": "intense, but fairly memory intensive,<00:18:29.720> relatively<00:18:30.200> speaking.<00:18:31.040> And<00:18:31.159> so," + }, + { + "start": 1111.23, + "duration": 0.0, + "text": "intensive, relatively speaking. And so," + }, + { + "start": 1111.24, + "duration": 0.0, + "text": "intensive, relatively speaking. And so, you<00:18:31.360> might<00:18:31.560> as<00:18:31.640> well<00:18:31.760> just<00:18:31.919> drop<00:18:32.240> these," + }, + { + "start": 1112.87, + "duration": 0.0, + "text": "you might as well just drop these," + }, + { + "start": 1112.88, + "duration": 0.0, + "text": "you might as well just drop these, right?<00:18:33.159> And<00:18:33.320> get<00:18:33.480> the<00:18:33.600> free<00:18:33.760> systems<00:18:34.240> win." + }, + { + "start": 1115.47, + "duration": 0.0, + "text": "right? And get the free systems win." + }, + { + "start": 1115.48, + "duration": 0.0, + "text": "right? And get the free systems win. 
There's<00:18:35.679> also<00:18:35.919> some<00:18:36.280> cases,<00:18:36.679> I'll<00:18:36.760> just" + }, + { + "start": 1116.909, + "duration": 0.0, + "text": "There's also some cases, I'll just" + }, + { + "start": 1116.919, + "duration": 0.0, + "text": "There's also some cases, I'll just mention<00:18:37.240> this<00:18:37.440> offhand,<00:18:37.919> where<00:18:38.040> the<00:18:38.159> bias" + }, + { + "start": 1118.59, + "duration": 0.0, + "text": "mention this offhand, where the bias" + }, + { + "start": 1118.6, + "duration": 0.0, + "text": "mention this offhand, where the bias terms<00:18:38.880> can<00:18:39.000> also<00:18:39.240> induce<00:18:39.560> stability<00:18:40.159> issues." + }, + { + "start": 1120.87, + "duration": 0.0, + "text": "terms can also induce stability issues." + }, + { + "start": 1120.88, + "duration": 0.0, + "text": "terms can also induce stability issues. So,<00:18:41.000> they're<00:18:41.159> useful<00:18:41.560> in<00:18:41.720> other<00:18:42.000> ways,<00:18:42.640> but" + }, + { + "start": 1122.83, + "duration": 0.0, + "text": "So, they're useful in other ways, but" + }, + { + "start": 1122.84, + "duration": 0.0, + "text": "So, they're useful in other ways, but really,<00:18:43.120> I<00:18:43.200> think<00:18:43.360> the<00:18:43.440> primary<00:18:43.919> reason<00:18:44.240> these" + }, + { + "start": 1124.39, + "duration": 0.0, + "text": "really, I think the primary reason these" + }, + { + "start": 1124.4, + "duration": 0.0, + "text": "really, I think the primary reason these are<00:18:44.520> dropped<00:18:44.880> is<00:18:45.040> just<00:18:45.280> to<00:18:45.360> simplify<00:18:45.880> things" + }, + { + "start": 1126.59, + "duration": 0.0, + "text": "are dropped is just to simplify things" + }, + { + "start": 1126.6, + "duration": 0.0, + "text": "are dropped is just to simplify things from<00:18:46.760> the<00:18:46.800> systems<00:18:47.240> perspective." 
+ }, + { + "start": 1129.31, + "duration": 0.0, + "text": "from the systems perspective." + }, + { + "start": 1129.32, + "duration": 0.0, + "text": "from the systems perspective. Cool." + }, + { + "start": 1131.63, + "duration": 0.0, + "text": "Cool." + }, + { + "start": 1131.64, + "duration": 0.0, + "text": "Cool. Okay.<00:18:52.480> So,<00:18:52.560> I<00:18:52.600> think<00:18:52.840> layer<00:18:53.159> norms,<00:18:53.560> the<00:18:53.679> story" + }, + { + "start": 1134.11, + "duration": 0.0, + "text": "Okay. So, I think layer norms, the story" + }, + { + "start": 1134.12, + "duration": 0.0, + "text": "Okay. So, I think layer norms, the story is<00:18:54.280> pretty<00:18:54.720> easy." + }, + { + "start": 1135.59, + "duration": 0.0, + "text": "is pretty easy." + }, + { + "start": 1135.6, + "duration": 0.0, + "text": "is pretty easy. Um<00:18:55.679> it's<00:18:55.880> easy<00:18:56.200> in<00:18:56.320> the<00:18:56.440> sense<00:18:56.679> that<00:18:57.040> what" + }, + { + "start": 1137.19, + "duration": 0.0, + "text": "Um it's easy in the sense that what" + }, + { + "start": 1137.2, + "duration": 0.0, + "text": "Um it's easy in the sense that what people<00:18:57.600> do<00:18:57.840> is<00:18:58.000> fairly<00:18:58.400> standardized.<00:18:59.360> Our" + }, + { + "start": 1139.669, + "duration": 0.0, + "text": "people do is fairly standardized. Our" + }, + { + "start": 1139.679, + "duration": 0.0, + "text": "people do is fairly standardized. 
Our our<00:18:59.960> understanding,<00:19:00.880> not<00:19:01.120> at<00:19:01.240> like<00:19:01.400> a<00:19:01.440> deep" + }, + { + "start": 1141.79, + "duration": 0.0, + "text": "our understanding, not at like a deep" + }, + { + "start": 1141.8, + "duration": 0.0, + "text": "our understanding, not at like a deep theoretical<00:19:02.320> level,<00:19:02.560> but<00:19:02.679> our<00:19:02.800> understanding" + }, + { + "start": 1143.35, + "duration": 0.0, + "text": "theoretical level, but our understanding" + }, + { + "start": 1143.36, + "duration": 0.0, + "text": "theoretical level, but our understanding of<00:19:03.480> like<00:19:04.080> what<00:19:04.280> layer<00:19:04.520> norm<00:19:05.159> does<00:19:05.919> is<00:19:06.080> fairly" + }, + { + "start": 1146.51, + "duration": 0.0, + "text": "of like what layer norm does is fairly" + }, + { + "start": 1146.52, + "duration": 0.0, + "text": "of like what layer norm does is fairly good,<00:19:06.880> right?<00:19:07.400> Everyone<00:19:07.880> moves<00:19:08.200> the<00:19:08.320> layer" + }, + { + "start": 1148.47, + "duration": 0.0, + "text": "good, right? Everyone moves the layer" + }, + { + "start": 1148.48, + "duration": 0.0, + "text": "good, right? 
Everyone moves the layer norm<00:19:08.800> outside<00:19:09.120> the<00:19:09.200> residual<00:19:09.679> stream," + }, + { + "start": 1150.95, + "duration": 0.0, + "text": "norm outside the residual stream," + }, + { + "start": 1150.96, + "duration": 0.0, + "text": "norm outside the residual stream, often<00:19:11.360> pre<00:19:11.560> norm,<00:19:11.880> but<00:19:12.000> I<00:19:12.040> think<00:19:12.240> this<00:19:12.400> might" + }, + { + "start": 1152.59, + "duration": 0.0, + "text": "often pre norm, but I think this might" + }, + { + "start": 1152.6, + "duration": 0.0, + "text": "often pre norm, but I think this might partially<00:19:13.120> be<00:19:13.440> be<00:19:13.600> because<00:19:13.880> Llama<00:19:14.159> 2<00:19:14.360> did" + }, + { + "start": 1154.55, + "duration": 0.0, + "text": "partially be be because Llama 2 did" + }, + { + "start": 1154.56, + "duration": 0.0, + "text": "partially be be because Llama 2 did that.<00:19:15.320> Um<00:19:16.200> and<00:19:16.360> we<00:19:16.840> roughly<00:19:17.159> have<00:19:17.320> a<00:19:17.360> sense<00:19:17.600> of" + }, + { + "start": 1157.75, + "duration": 0.0, + "text": "that. Um and we roughly have a sense of" + }, + { + "start": 1157.76, + "duration": 0.0, + "text": "that. 
Um and we roughly have a sense of how<00:19:18.000> to<00:19:18.159> use<00:19:18.360> layer<00:19:18.640> norm<00:19:18.880> to<00:19:19.000> control<00:19:19.400> things" + }, + { + "start": 1159.71, + "duration": 0.0, + "text": "how to use layer norm to control things" + }, + { + "start": 1159.72, + "duration": 0.0, + "text": "how to use layer norm to control things like" + }, + { + "start": 1160.83, + "duration": 0.0, + "text": "like" + }, + { + "start": 1160.84, + "duration": 0.0, + "text": "like gradient<00:19:21.240> spikes" + }, + { + "start": 1162.79, + "duration": 0.0, + "text": "gradient spikes" + }, + { + "start": 1162.8, + "duration": 0.0, + "text": "gradient spikes and<00:19:23.040> keep<00:19:23.280> signal<00:19:23.600> propagation<00:19:24.200> nice." + }, + { + "start": 1165.43, + "duration": 0.0, + "text": "and keep signal propagation nice." + }, + { + "start": 1165.44, + "duration": 0.0, + "text": "and keep signal propagation nice. Related<00:19:26.040> to<00:19:26.159> that,<00:19:26.520> we<00:19:26.679> also<00:19:26.960> now,<00:19:27.480> you<00:19:27.600> know," + }, + { + "start": 1167.71, + "duration": 0.0, + "text": "Related to that, we also now, you know," + }, + { + "start": 1167.72, + "duration": 0.0, + "text": "Related to that, we also now, you know, basically<00:19:28.080> always<00:19:28.360> use<00:19:28.560> RMS<00:19:28.919> norm,<00:19:29.320> and<00:19:29.400> you" + }, + { + "start": 1169.47, + "duration": 0.0, + "text": "basically always use RMS norm, and you" + }, + { + "start": 1169.48, + "duration": 0.0, + "text": "basically always use RMS norm, and you hopefully<00:19:29.840> understand<00:19:30.280> the<00:19:30.360> general" + }, + { + "start": 1170.71, + "duration": 0.0, + "text": "hopefully understand the general" + }, + { + "start": 1170.72, + "duration": 0.0, + "text": "hopefully understand the general principles<00:19:31.280> here" + }, + { + "start": 1172.31, + "duration": 0.0, + "text": "principles here" + }, + 
{ + "start": 1172.32, + "duration": 0.0, + "text": "principles here of<00:19:32.720> basically<00:19:32.960> just<00:19:33.159> dropping<00:19:33.800> bias<00:19:34.280> terms." + }, + { + "start": 1174.83, + "duration": 0.0, + "text": "of basically just dropping bias terms." + }, + { + "start": 1174.84, + "duration": 0.0, + "text": "of basically just dropping bias terms. And<00:19:34.919> that<00:19:35.120> allows<00:19:35.520> us<00:19:35.679> to<00:19:36.040> to<00:19:36.159> keep<00:19:36.560> our<00:19:36.840> system" + }, + { + "start": 1177.23, + "duration": 0.0, + "text": "And that allows us to to keep our system" + }, + { + "start": 1177.24, + "duration": 0.0, + "text": "And that allows us to to keep our system more<00:19:37.480> arithmetically<00:19:38.120> intense" + }, + { + "start": 1179.59, + "duration": 0.0, + "text": "more arithmetically intense" + }, + { + "start": 1179.6, + "duration": 0.0, + "text": "more arithmetically intense while<00:19:39.800> keeping<00:19:40.120> the<00:19:40.200> expressive<00:19:40.679> power<00:19:40.919> the" + }, + { + "start": 1180.99, + "duration": 0.0, + "text": "while keeping the expressive power the" + }, + { + "start": 1181.0, + "duration": 0.0, + "text": "while keeping the expressive power the same." + }, + { + "start": 1182.19, + "duration": 0.0, + "text": "same." + }, + { + "start": 1182.2, + "duration": 0.0, + "text": "same. 
Um<00:19:43.120> I<00:19:43.200> think<00:19:43.360> the<00:19:43.440> unsatisfying<00:19:44.159> thing<00:19:44.320> about" + }, + { + "start": 1184.59, + "duration": 0.0, + "text": "Um I think the unsatisfying thing about" + }, + { + "start": 1184.6, + "duration": 0.0, + "text": "Um I think the unsatisfying thing about a<00:19:44.640> lot<00:19:44.840> of<00:19:44.960> architectures<00:19:45.960> is<00:19:46.120> that,<00:19:46.600> you" + }, + { + "start": 1186.669, + "duration": 0.0, + "text": "a lot of architectures is that, you" + }, + { + "start": 1186.679, + "duration": 0.0, + "text": "a lot of architectures is that, you know,<00:19:46.919> you<00:19:47.040> can't<00:19:47.320> really<00:19:47.560> reason<00:19:47.880> about<00:19:48.159> this" + }, + { + "start": 1188.31, + "duration": 0.0, + "text": "know, you can't really reason about this" + }, + { + "start": 1188.32, + "duration": 0.0, + "text": "know, you can't really reason about this beforehand,<00:19:48.960> right?<00:19:49.120> Like<00:19:49.240> we<00:19:49.320> don't<00:19:49.560> know" + }, + { + "start": 1189.71, + "duration": 0.0, + "text": "beforehand, right? Like we don't know" + }, + { + "start": 1189.72, + "duration": 0.0, + "text": "beforehand, right? 
Like we don't know beforehand<00:19:50.280> that<00:19:50.400> dropping<00:19:50.760> the<00:19:50.840> bias<00:19:51.159> terms" + }, + { + "start": 1191.43, + "duration": 0.0, + "text": "beforehand that dropping the bias terms" + }, + { + "start": 1191.44, + "duration": 0.0, + "text": "beforehand that dropping the bias terms is<00:19:51.560> okay,<00:19:52.480> but<00:19:52.600> from<00:19:52.760> a<00:19:52.800> lot<00:19:53.040> of" + }, + { + "start": 1193.07, + "duration": 0.0, + "text": "is okay, but from a lot of" + }, + { + "start": 1193.08, + "duration": 0.0, + "text": "is okay, but from a lot of experimentation<00:19:53.960> and<00:19:54.040> now<00:19:54.200> collectively" + }, + { + "start": 1194.75, + "duration": 0.0, + "text": "experimentation and now collectively" + }, + { + "start": 1194.76, + "duration": 0.0, + "text": "experimentation and now collectively acquired<00:19:55.160> knowledge,<00:19:55.560> we<00:19:55.680> roughly<00:19:56.080> know<00:19:56.760> that" + }, + { + "start": 1197.03, + "duration": 0.0, + "text": "acquired knowledge, we roughly know that" + }, + { + "start": 1197.04, + "duration": 0.0, + "text": "acquired knowledge, we roughly know that dropping<00:19:57.360> the<00:19:57.480> bias<00:19:57.800> terms<00:19:58.160> on<00:19:58.400> both<00:19:58.600> the" + }, + { + "start": 1198.67, + "duration": 0.0, + "text": "dropping the bias terms on both the" + }, + { + "start": 1198.68, + "duration": 0.0, + "text": "dropping the bias terms on both the linear<00:19:59.000> and<00:19:59.120> RMS<00:19:59.440> norm<00:19:59.680> is<00:19:59.800> okay<00:20:00.520> for<00:20:00.680> typical" + }, + { + "start": 1201.23, + "duration": 0.0, + "text": "linear and RMS norm is okay for typical" + }, + { + "start": 1201.24, + "duration": 0.0, + "text": "linear and RMS norm is okay for typical language<00:20:01.560> modeling<00:20:01.960> workloads,<00:20:02.520> right?<00:20:03.040> Um" + }, + { + "start": 1203.11, + "duration": 0.0, + 
"text": "language modeling workloads, right? Um" + }, + { + "start": 1203.12, + "duration": 0.0, + "text": "language modeling workloads, right? Um this<00:20:03.240> is<00:20:03.360> the<00:20:03.480> kind<00:20:03.680> of<00:20:03.800> statement<00:20:04.160> that<00:20:04.240> we" + }, + { + "start": 1204.35, + "duration": 0.0, + "text": "this is the kind of statement that we" + }, + { + "start": 1204.36, + "duration": 0.0, + "text": "this is the kind of statement that we can<00:20:04.480> make<00:20:04.720> on<00:20:04.800> the<00:20:04.880> basis<00:20:05.320> of<00:20:05.800> of<00:20:05.960> what<00:20:06.120> we<00:20:06.240> do" + }, + { + "start": 1206.43, + "duration": 0.0, + "text": "can make on the basis of of what we do" + }, + { + "start": 1206.44, + "duration": 0.0, + "text": "can make on the basis of of what we do when<00:20:06.560> we<00:20:06.680> look<00:20:06.920> at<00:20:07.160> a<00:20:07.240> variety<00:20:07.600> of<00:20:07.680> different" + }, + { + "start": 1207.95, + "duration": 0.0, + "text": "when we look at a variety of different" + }, + { + "start": 1207.96, + "duration": 0.0, + "text": "when we look at a variety of different models." + }, + { + "start": 1209.63, + "duration": 0.0, + "text": "models." + }, + { + "start": 1209.64, + "duration": 0.0, + "text": "models. Okay,<00:20:10.120> any<00:20:10.240> questions<00:20:11.000> uh<00:20:11.200> for<00:20:11.320> layer<00:20:11.520> norm" + }, + { + "start": 1211.67, + "duration": 0.0, + "text": "Okay, any questions uh for layer norm" + }, + { + "start": 1211.68, + "duration": 0.0, + "text": "Okay, any questions uh for layer norm stuff?" + }, + { + "start": 1213.91, + "duration": 0.0, + "text": "stuff?" + }, + { + "start": 1213.92, + "duration": 0.0, + "text": "stuff? Good.<00:20:14.600> Okay.<00:20:15.200> So,<00:20:15.360> now<00:20:15.680> I'm<00:20:15.760> going<00:20:15.880> to<00:20:15.920> talk" + }, + { + "start": 1216.11, + "duration": 0.0, + "text": "Good. Okay. 
So, now I'm going to talk" + }, + { + "start": 1216.12, + "duration": 0.0, + "text": "Good. Okay. So, now I'm going to talk about<00:20:16.400> activations.<00:20:17.680> Um" + }, + { + "start": 1218.23, + "duration": 0.0, + "text": "about activations. Um" + }, + { + "start": 1218.24, + "duration": 0.0, + "text": "about activations. Um and<00:20:18.360> there's<00:20:18.520> a<00:20:18.640> whole<00:20:19.120> zoo<00:20:19.480> of<00:20:19.600> activations." + }, + { + "start": 1220.23, + "duration": 0.0, + "text": "and there's a whole zoo of activations." + }, + { + "start": 1220.24, + "duration": 0.0, + "text": "and there's a whole zoo of activations. There's<00:20:20.400> just<00:20:20.560> a<00:20:20.600> lot,<00:20:20.880> right?<00:20:21.120> Like<00:20:21.240> ReLU," + }, + { + "start": 1221.83, + "duration": 0.0, + "text": "There's just a lot, right? Like ReLU," + }, + { + "start": 1221.84, + "duration": 0.0, + "text": "There's just a lot, right? Like ReLU, GELU,<00:20:22.360> Swish<00:20:22.840> ELU,<00:20:23.680> GeGLU," + }, + { + "start": 1224.99, + "duration": 0.0, + "text": "GELU, Swish ELU, GeGLU," + }, + { + "start": 1225.0, + "duration": 0.0, + "text": "GELU, Swish ELU, GeGLU, SeLU,<00:20:25.560> SwiGLU,<00:20:26.120> LiGLU." + }, + { + "start": 1227.23, + "duration": 0.0, + "text": "SeLU, SwiGLU, LiGLU." + }, + { + "start": 1227.24, + "duration": 0.0, + "text": "SeLU, SwiGLU, LiGLU. Um<00:20:27.360> and<00:20:27.480> what<00:20:27.600> are<00:20:27.680> these<00:20:27.840> things?" + }, + { + "start": 1228.99, + "duration": 0.0, + "text": "Um and what are these things?" + }, + { + "start": 1229.0, + "duration": 0.0, + "text": "Um and what are these things? 
Um<00:20:29.520> I<00:20:29.600> think<00:20:29.800> at<00:20:29.880> what<00:20:30.080> point<00:20:30.440> one<00:20:30.600> point<00:20:30.960> of" + }, + { + "start": 1231.11, + "duration": 0.0, + "text": "Um I think at what point one point of" + }, + { + "start": 1231.12, + "duration": 0.0, + "text": "Um I think at what point one point of my,<00:20:31.480> you<00:20:31.600> know,<00:20:32.000> more<00:20:32.240> stats<00:20:32.680> ML<00:20:32.960> training,<00:20:33.800> I" + }, + { + "start": 1233.91, + "duration": 0.0, + "text": "my, you know, more stats ML training, I" + }, + { + "start": 1233.92, + "duration": 0.0, + "text": "my, you know, more stats ML training, I thought<00:20:34.160> to<00:20:34.240> myself<00:20:34.640> I<00:20:34.720> will<00:20:34.880> never<00:20:35.160> learn" + }, + { + "start": 1235.39, + "duration": 0.0, + "text": "thought to myself I will never learn" + }, + { + "start": 1235.4, + "duration": 0.0, + "text": "thought to myself I will never learn these<00:20:35.600> things.<00:20:36.000> I<00:20:36.040> will<00:20:36.160> make<00:20:36.320> it<00:20:36.400> a<00:20:36.440> point<00:20:36.680> of" + }, + { + "start": 1236.79, + "duration": 0.0, + "text": "these things. I will make it a point of" + }, + { + "start": 1236.8, + "duration": 0.0, + "text": "these things. I will make it a point of pride<00:20:37.080> to<00:20:37.160> never<00:20:37.400> know<00:20:37.560> what<00:20:37.680> a<00:20:37.720> SwiGLU<00:20:38.160> is.<00:20:38.800> Um" + }, + { + "start": 1238.91, + "duration": 0.0, + "text": "pride to never know what a SwiGLU is. Um" + }, + { + "start": 1238.92, + "duration": 0.0, + "text": "pride to never know what a SwiGLU is. 
Um but<00:20:39.080> now<00:20:39.240> it's<00:20:39.360> actually<00:20:39.600> very<00:20:39.840> important<00:20:40.560> for" + }, + { + "start": 1240.71, + "duration": 0.0, + "text": "but now it's actually very important for" + }, + { + "start": 1240.72, + "duration": 0.0, + "text": "but now it's actually very important for us<00:20:40.880> to<00:20:41.320> to<00:20:41.400> actually<00:20:41.640> like<00:20:42.000> have<00:20:42.200> a<00:20:42.280> general" + }, + { + "start": 1242.75, + "duration": 0.0, + "text": "us to to actually like have a general" + }, + { + "start": 1242.76, + "duration": 0.0, + "text": "us to to actually like have a general sense<00:20:43.000> of<00:20:43.120> what<00:20:43.240> these<00:20:43.440> objects<00:20:43.920> are<00:20:44.640> um<00:20:44.920> and" + }, + { + "start": 1245.03, + "duration": 0.0, + "text": "sense of what these objects are um and" + }, + { + "start": 1245.04, + "duration": 0.0, + "text": "sense of what these objects are um and which<00:20:45.280> parts<00:20:45.640> of<00:20:45.760> these<00:20:46.000> names<00:20:46.440> actually<00:20:46.840> kind" + }, + { + "start": 1246.99, + "duration": 0.0, + "text": "which parts of these names actually kind" + }, + { + "start": 1247.0, + "duration": 0.0, + "text": "which parts of these names actually kind of<00:20:47.200> matter<00:20:47.600> for<00:20:47.800> performance,<00:20:48.680> right?" + }, + { + "start": 1250.07, + "duration": 0.0, + "text": "of matter for performance, right?" + }, + { + "start": 1250.08, + "duration": 0.0, + "text": "of matter for performance, right? 
Um<00:20:51.000> So,<00:20:51.800> you<00:20:51.960> can<00:20:52.520> build<00:20:52.880> and<00:20:53.000> train<00:20:53.440> a" + }, + { + "start": 1253.55, + "duration": 0.0, + "text": "Um So, you can build and train a" + }, + { + "start": 1253.56, + "duration": 0.0, + "text": "Um So, you can build and train a language<00:20:54.000> model<00:20:54.760> on<00:20:55.080> just<00:20:55.360> a<00:20:55.440> fairly<00:20:55.920> vanilla" + }, + { + "start": 1256.43, + "duration": 0.0, + "text": "language model on just a fairly vanilla" + }, + { + "start": 1256.44, + "duration": 0.0, + "text": "language model on just a fairly vanilla activation.<00:20:57.760> Um" + }, + { + "start": 1258.39, + "duration": 0.0, + "text": "activation. Um" + }, + { + "start": 1258.4, + "duration": 0.0, + "text": "activation. Um even,<00:20:59.000> you<00:20:59.120> know,<00:20:59.320> I<00:20:59.400> guess<00:20:59.640> Chinchilla<00:21:00.200> is" + }, + { + "start": 1260.35, + "duration": 0.0, + "text": "even, you know, I guess Chinchilla is" + }, + { + "start": 1260.36, + "duration": 0.0, + "text": "even, you know, I guess Chinchilla is probably<00:21:00.680> the<00:21:00.800> best<00:21:01.120> model<00:21:01.480> out<00:21:01.640> of<00:21:01.760> that" + }, + { + "start": 1261.95, + "duration": 0.0, + "text": "probably the best model out of that" + }, + { + "start": 1261.96, + "duration": 0.0, + "text": "probably the best model out of that group,<00:21:02.240> but<00:21:02.400> even<00:21:02.600> if<00:21:02.720> you<00:21:02.800> just<00:21:03.000> want<00:21:03.200> a<00:21:03.280> ReLU," + }, + { + "start": 1264.23, + "duration": 0.0, + "text": "group, but even if you just want a ReLU," + }, + { + "start": 1264.24, + "duration": 0.0, + "text": "group, but even if you just want a ReLU, you<00:21:04.360> know,<00:21:04.480> you<00:21:04.560> can<00:21:04.680> train<00:21:04.960> a<00:21:05.040> reasonably" + }, + { + "start": 1265.63, + "duration": 0.0, + "text": "you know, you 
can train a reasonably" + }, + { + "start": 1265.64, + "duration": 0.0, + "text": "you know, you can train a reasonably performant<00:21:06.200> language<00:21:06.560> model<00:21:06.880> using<00:21:07.160> just" + }, + { + "start": 1267.39, + "duration": 0.0, + "text": "performant language model using just" + }, + { + "start": 1267.4, + "duration": 0.0, + "text": "performant language model using just that<00:21:07.640> activation.<00:21:08.160> There's<00:21:08.320> nothing<00:21:08.680> wrong" + }, + { + "start": 1268.95, + "duration": 0.0, + "text": "that activation. There's nothing wrong" + }, + { + "start": 1268.96, + "duration": 0.0, + "text": "that activation. There's nothing wrong with<00:21:09.120> that,<00:21:09.400> right?<00:21:10.000> And<00:21:10.080> if<00:21:10.160> we<00:21:10.280> move<00:21:10.560> to" + }, + { + "start": 1270.67, + "duration": 0.0, + "text": "with that, right? And if we move to" + }, + { + "start": 1270.68, + "duration": 0.0, + "text": "with that, right? 
And if we move to GELU,<00:21:11.200> which<00:21:11.520> is<00:21:11.720> a<00:21:11.840> Gaussian<00:21:12.320> error<00:21:12.600> unit," + }, + { + "start": 1272.91, + "duration": 0.0, + "text": "GELU, which is a Gaussian error unit," + }, + { + "start": 1272.92, + "duration": 0.0, + "text": "GELU, which is a Gaussian error unit, and<00:21:13.080> really<00:21:13.320> the<00:21:13.480> only<00:21:13.720> difference<00:21:14.080> is<00:21:14.200> this" + }, + { + "start": 1274.39, + "duration": 0.0, + "text": "and really the only difference is this" + }, + { + "start": 1274.4, + "duration": 0.0, + "text": "and really the only difference is this tiny<00:21:14.960> divot<00:21:15.360> at<00:21:15.440> the<00:21:15.520> bottom<00:21:15.880> here,<00:21:16.520> which" + }, + { + "start": 1277.07, + "duration": 0.0, + "text": "tiny divot at the bottom here, which" + }, + { + "start": 1277.08, + "duration": 0.0, + "text": "tiny divot at the bottom here, which really,<00:21:17.640> you<00:21:17.760> know,<00:21:18.000> for<00:21:18.160> the<00:21:18.280> most<00:21:18.760> of<00:21:18.880> the" + }, + { + "start": 1279.03, + "duration": 0.0, + "text": "really, you know, for the most of the" + }, + { + "start": 1279.04, + "duration": 0.0, + "text": "really, you know, for the most of the activation<00:21:19.600> doesn't<00:21:19.880> change<00:21:20.080> anything,<00:21:20.360> but" + }, + { + "start": 1280.47, + "duration": 0.0, + "text": "activation doesn't change anything, but" + }, + { + "start": 1280.48, + "duration": 0.0, + "text": "activation doesn't change anything, but changes<00:21:20.800> the<00:21:20.880> gradients<00:21:21.400> right<00:21:21.560> near<00:21:21.760> zero," + }, + { + "start": 1282.67, + "duration": 0.0, + "text": "changes the gradients right near zero," + }, + { + "start": 1282.68, + "duration": 0.0, + "text": "changes the gradients right near zero, um<00:21:23.120> then<00:21:23.280> you<00:21:23.360> can<00:21:23.480> train<00:21:23.800> 
models<00:21:24.360> like<00:21:24.560> GPT-3," + }, + { + "start": 1285.19, + "duration": 0.0, + "text": "um then you can train models like GPT-3," + }, + { + "start": 1285.2, + "duration": 0.0, + "text": "um then you can train models like GPT-3, right?<00:21:25.360> That's<00:21:25.480> a<00:21:25.520> perfectly<00:21:26.120> good<00:21:26.480> large" + }, + { + "start": 1286.75, + "duration": 0.0, + "text": "right? That's a perfectly good large" + }, + { + "start": 1286.76, + "duration": 0.0, + "text": "right? That's a perfectly good large language<00:21:27.080> model,<00:21:27.600> not,<00:21:28.040> you<00:21:28.120> know,<00:21:28.240> modern<00:21:28.680> by" + }, + { + "start": 1288.83, + "duration": 0.0, + "text": "language model, not, you know, modern by" + }, + { + "start": 1288.84, + "duration": 0.0, + "text": "language model, not, you know, modern by modern<00:21:29.160> standards,<00:21:29.760> but<00:21:30.360> perfectly<00:21:30.760> fine." + }, + { + "start": 1291.67, + "duration": 0.0, + "text": "modern standards, but perfectly fine." + }, + { + "start": 1291.68, + "duration": 0.0, + "text": "modern standards, but perfectly fine. 
Um<00:21:32.040> but<00:21:32.240> then,<00:21:32.760> you<00:21:32.840> know,<00:21:33.040> we<00:21:33.160> get<00:21:33.360> to<00:21:33.440> the" + }, + { + "start": 1293.55, + "duration": 0.0, + "text": "Um but then, you know, we get to the" + }, + { + "start": 1293.56, + "duration": 0.0, + "text": "Um but then, you know, we get to the gated<00:21:33.920> linear<00:21:34.200> units<00:21:34.680> like<00:21:34.840> SwiGLU<00:21:35.280> and" + }, + { + "start": 1295.43, + "duration": 0.0, + "text": "gated linear units like SwiGLU and" + }, + { + "start": 1295.44, + "duration": 0.0, + "text": "gated linear units like SwiGLU and GeGLU,<00:21:36.280> and<00:21:36.400> these<00:21:36.560> are<00:21:36.720> really<00:21:37.040> where<00:21:37.240> most" + }, + { + "start": 1297.67, + "duration": 0.0, + "text": "GeGLU, and these are really where most" + }, + { + "start": 1297.68, + "duration": 0.0, + "text": "GeGLU, and these are really where most of<00:21:37.800> the<00:21:38.000> action<00:21:38.440> is.<00:21:38.840> You<00:21:38.920> know,<00:21:39.000> this<00:21:39.160> is<00:21:39.280> very" + }, + { + "start": 1299.47, + "duration": 0.0, + "text": "of the action is. You know, this is very" + }, + { + "start": 1299.48, + "duration": 0.0, + "text": "of the action is. 
You know, this is very similar<00:21:39.960> to<00:21:40.360> layer<00:21:40.680> norm<00:21:41.440> in<00:21:41.560> that<00:21:41.920> I<00:21:41.960> think" + }, + { + "start": 1302.23, + "duration": 0.0, + "text": "similar to layer norm in that I think" + }, + { + "start": 1302.24, + "duration": 0.0, + "text": "similar to layer norm in that I think almost<00:21:42.840> all<00:21:43.680> credible<00:21:44.600> modern<00:21:45.040> language" + }, + { + "start": 1305.43, + "duration": 0.0, + "text": "almost all credible modern language" + }, + { + "start": 1305.44, + "duration": 0.0, + "text": "almost all credible modern language models<00:21:46.120> use<00:21:46.360> a<00:21:46.440> gated<00:21:46.880> linear<00:21:47.200> unit<00:21:47.600> of<00:21:47.760> some" + }, + { + "start": 1308.07, + "duration": 0.0, + "text": "models use a gated linear unit of some" + }, + { + "start": 1308.08, + "duration": 0.0, + "text": "models use a gated linear unit of some kind." + }, + { + "start": 1309.87, + "duration": 0.0, + "text": "kind." + }, + { + "start": 1309.88, + "duration": 0.0, + "text": "kind. Okay,<00:21:50.360> so<00:21:50.560> what<00:21:50.760> is<00:21:50.880> a<00:21:50.920> gated<00:21:51.400> linear<00:21:51.760> unit?" + }, + { + "start": 1312.43, + "duration": 0.0, + "text": "Okay, so what is a gated linear unit?" + }, + { + "start": 1312.44, + "duration": 0.0, + "text": "Okay, so what is a gated linear unit? So,<00:21:52.600> these<00:21:52.800> are<00:21:52.960> gated<00:21:53.480> activations.<00:21:54.360> So,<00:21:54.560> if" + }, + { + "start": 1314.67, + "duration": 0.0, + "text": "So, these are gated activations. So, if" + }, + { + "start": 1314.68, + "duration": 0.0, + "text": "So, these are gated activations. 
So, if we<00:21:54.800> want<00:21:55.000> to<00:21:55.080> look<00:21:55.240> at<00:21:55.360> something<00:21:55.680> like<00:21:55.880> a<00:21:55.960> feed" + }, + { + "start": 1316.35, + "duration": 0.0, + "text": "we want to look at something like a feed" + }, + { + "start": 1316.36, + "duration": 0.0, + "text": "we want to look at something like a feed forward<00:21:56.720> layer,<00:21:57.600> um<00:21:57.760> we<00:21:57.840> can<00:21:57.960> just<00:21:58.120> look<00:21:58.240> at" + }, + { + "start": 1318.31, + "duration": 0.0, + "text": "forward layer, um we can just look at" + }, + { + "start": 1318.32, + "duration": 0.0, + "text": "forward layer, um we can just look at this<00:21:58.480> first<00:21:58.840> part.<00:21:59.080> This<00:21:59.240> is,<00:21:59.360> you<00:21:59.440> know,<00:21:59.560> a" + }, + { + "start": 1319.59, + "duration": 0.0, + "text": "this first part. This is, you know, a" + }, + { + "start": 1319.6, + "duration": 0.0, + "text": "this first part. This is, you know, a very<00:22:00.040> standard<00:22:00.600> ReLU<00:22:00.960> feed<00:22:01.280> forward,<00:22:01.720> right?" + }, + { + "start": 1321.95, + "duration": 0.0, + "text": "very standard ReLU feed forward, right?" + }, + { + "start": 1321.96, + "duration": 0.0, + "text": "very standard ReLU feed forward, right? 
I<00:22:02.000> have<00:22:02.160> my<00:22:02.360> X,<00:22:03.000> I<00:22:03.120> hit<00:22:03.280> it<00:22:03.360> with<00:22:03.520> a<00:22:03.560> W1,<00:22:04.720> you" + }, + { + "start": 1324.79, + "duration": 0.0, + "text": "I have my X, I hit it with a W1, you" + }, + { + "start": 1324.8, + "duration": 0.0, + "text": "I have my X, I hit it with a W1, you know,<00:22:04.960> I<00:22:05.160> I<00:22:05.320> entry-wise<00:22:06.040> threshold<00:22:06.400> at<00:22:06.560> zero," + }, + { + "start": 1326.95, + "duration": 0.0, + "text": "know, I I entry-wise threshold at zero," + }, + { + "start": 1326.96, + "duration": 0.0, + "text": "know, I I entry-wise threshold at zero, and<00:22:07.080> then<00:22:07.200> I<00:22:07.280> hit<00:22:07.440> it<00:22:07.520> with<00:22:07.640> another<00:22:07.920> W2,<00:22:08.480> I<00:22:08.560> get" + }, + { + "start": 1328.669, + "duration": 0.0, + "text": "and then I hit it with another W2, I get" + }, + { + "start": 1328.679, + "duration": 0.0, + "text": "and then I hit it with another W2, I get my<00:22:08.880> output,<00:22:09.200> right?<00:22:09.480> Very<00:22:09.840> straightforward" + }, + { + "start": 1330.47, + "duration": 0.0, + "text": "my output, right? Very straightforward" + }, + { + "start": 1330.48, + "duration": 0.0, + "text": "my output, right? Very straightforward ReLU<00:22:10.800> network." + }, + { + "start": 1332.11, + "duration": 0.0, + "text": "ReLU network." + }, + { + "start": 1332.12, + "duration": 0.0, + "text": "ReLU network. 
Um" + }, + { + "start": 1333.47, + "duration": 0.0, + "text": "Um" + }, + { + "start": 1333.48, + "duration": 0.0, + "text": "Um Another<00:22:13.800> thing,<00:22:14.560> um<00:22:14.800> I<00:22:14.880> don't<00:22:15.120> say<00:22:15.280> this<00:22:15.480> as<00:22:15.600> my" + }, + { + "start": 1335.71, + "duration": 0.0, + "text": "Another thing, um I don't say this as my" + }, + { + "start": 1335.72, + "duration": 0.0, + "text": "Another thing, um I don't say this as my personal<00:22:16.160> experience,<00:22:16.760> but<00:22:16.880> another<00:22:17.200> thing" + }, + { + "start": 1337.43, + "duration": 0.0, + "text": "personal experience, but another thing" + }, + { + "start": 1337.44, + "duration": 0.0, + "text": "personal experience, but another thing that<00:22:17.560> is<00:22:17.720> often<00:22:18.080> said<00:22:18.360> in<00:22:18.480> architecture" + }, + { + "start": 1338.91, + "duration": 0.0, + "text": "that is often said in architecture" + }, + { + "start": 1338.92, + "duration": 0.0, + "text": "that is often said in architecture design<00:22:19.679> is<00:22:19.840> that<00:22:20.120> gating<00:22:20.520> is<00:22:20.679> often<00:22:21.000> very" + }, + { + "start": 1341.27, + "duration": 0.0, + "text": "design is that gating is often very" + }, + { + "start": 1341.28, + "duration": 0.0, + "text": "design is that gating is often very helpful.<00:22:21.840> So,<00:22:21.920> if<00:22:22.080> you<00:22:22.160> apply<00:22:22.560> that<00:22:22.760> very" + }, + { + "start": 1342.95, + "duration": 0.0, + "text": "helpful. So, if you apply that very" + }, + { + "start": 1342.96, + "duration": 0.0, + "text": "helpful. 
So, if you apply that very general<00:22:23.320> heuristic,<00:22:24.280> what<00:22:24.440> you<00:22:24.560> might<00:22:24.760> get<00:22:24.960> is" + }, + { + "start": 1345.07, + "duration": 0.0, + "text": "general heuristic, what you might get is" + }, + { + "start": 1345.08, + "duration": 0.0, + "text": "general heuristic, what you might get is to<00:22:25.200> say,<00:22:25.480> \"Okay,<00:22:26.160> well,<00:22:26.720> instead<00:22:27.120> of<00:22:27.240> just" + }, + { + "start": 1347.59, + "duration": 0.0, + "text": "to say, \"Okay, well, instead of just" + }, + { + "start": 1347.6, + "duration": 0.0, + "text": "to say, \"Okay, well, instead of just having,<00:22:28.240> you<00:22:28.320> know,<00:22:28.440> this<00:22:28.640> entry-wise<00:22:29.280> ReLU," + }, + { + "start": 1350.39, + "duration": 0.0, + "text": "having, you know, this entry-wise ReLU," + }, + { + "start": 1350.4, + "duration": 0.0, + "text": "having, you know, this entry-wise ReLU, why<00:22:30.520> don't<00:22:30.679> we<00:22:30.880> also<00:22:31.200> have<00:22:31.360> a<00:22:31.440> gate?<00:22:32.040> And<00:22:32.160> the" + }, + { + "start": 1352.23, + "duration": 0.0, + "text": "why don't we also have a gate? And the" + }, + { + "start": 1352.24, + "duration": 0.0, + "text": "why don't we also have a gate? 
And the second<00:22:32.679> gate,<00:22:33.360> the<00:22:33.520> second<00:22:33.840> term<00:22:34.080> here,<00:22:34.679> this" + }, + { + "start": 1354.87, + "duration": 0.0, + "text": "second gate, the second term here, this" + }, + { + "start": 1354.88, + "duration": 0.0, + "text": "second gate, the second term here, this is<00:22:35.000> just<00:22:35.160> going<00:22:35.320> to<00:22:35.440> multiply<00:22:36.280> the<00:22:36.520> output<00:22:36.840> of" + }, + { + "start": 1356.95, + "duration": 0.0, + "text": "is just going to multiply the output of" + }, + { + "start": 1356.96, + "duration": 0.0, + "text": "is just going to multiply the output of my<00:22:37.120> ReLU<00:22:38.040> entry-wise,<00:22:38.960> and<00:22:39.080> I<00:22:39.120> have<00:22:39.320> a<00:22:39.720> second" + }, + { + "start": 1360.07, + "duration": 0.0, + "text": "my ReLU entry-wise, and I have a second" + }, + { + "start": 1360.08, + "duration": 0.0, + "text": "my ReLU entry-wise, and I have a second matrix<00:22:40.480> V,<00:22:40.920> okay?\"" + }, + { + "start": 1361.99, + "duration": 0.0, + "text": "matrix V, okay?\"" + }, + { + "start": 1362.0, + "duration": 0.0, + "text": "matrix V, okay?\" Now,<00:22:42.120> this<00:22:42.240> is<00:22:42.320> just<00:22:42.440> going<00:22:42.560> to<00:22:42.640> modulate<00:22:43.320> the" + }, + { + "start": 1363.51, + "duration": 0.0, + "text": "Now, this is just going to modulate the" + }, + { + "start": 1363.52, + "duration": 0.0, + "text": "Now, this is just going to modulate the output<00:22:44.000> of<00:22:44.120> my<00:22:44.320> ReLU." + }, + { + "start": 1365.39, + "duration": 0.0, + "text": "output of my ReLU." + }, + { + "start": 1365.4, + "duration": 0.0, + "text": "output of my ReLU. 
Um<00:22:45.880> and<00:22:46.040> then<00:22:46.440> I'm<00:22:46.520> going<00:22:46.679> to<00:22:46.920> do<00:22:47.160> everything" + }, + { + "start": 1367.59, + "duration": 0.0, + "text": "Um and then I'm going to do everything" + }, + { + "start": 1367.6, + "duration": 0.0, + "text": "Um and then I'm going to do everything else<00:22:47.800> the<00:22:47.920> same,<00:22:48.440> right?<00:22:48.640> So,<00:22:48.760> instead<00:22:49.200> of," + }, + { + "start": 1369.91, + "duration": 0.0, + "text": "else the same, right? So, instead of," + }, + { + "start": 1369.92, + "duration": 0.0, + "text": "else the same, right? So, instead of, you<00:22:50.000> know,<00:22:50.120> just<00:22:50.360> having<00:22:50.920> XW1W2,<00:22:52.400> I<00:22:52.480> have<00:22:52.800> XW1," + }, + { + "start": 1373.63, + "duration": 0.0, + "text": "you know, just having XW1W2, I have XW1," + }, + { + "start": 1373.64, + "duration": 0.0, + "text": "you know, just having XW1W2, I have XW1, and<00:22:53.720> I'm<00:22:53.800> going<00:22:53.920> to<00:22:54.040> gate<00:22:54.360> that<00:22:54.600> with<00:22:54.960> XV.<00:22:55.520> This" + }, + { + "start": 1375.71, + "duration": 0.0, + "text": "and I'm going to gate that with XV. This" + }, + { + "start": 1375.72, + "duration": 0.0, + "text": "and I'm going to gate that with XV. 
This is<00:22:55.800> another" + }, + { + "start": 1377.03, + "duration": 0.0, + "text": "is another" + }, + { + "start": 1377.04, + "duration": 0.0, + "text": "is another uh<00:22:57.120> activation<00:22:57.679> the<00:22:57.760> same<00:22:58.000> size<00:22:58.240> as<00:22:58.360> this,<00:22:59.000> and" + }, + { + "start": 1379.15, + "duration": 0.0, + "text": "uh activation the same size as this, and" + }, + { + "start": 1379.16, + "duration": 0.0, + "text": "uh activation the same size as this, and then<00:22:59.320> I'm<00:22:59.440> going<00:22:59.640> to,<00:23:00.160> you<00:23:00.240> know,<00:23:00.360> down" + }, + { + "start": 1380.63, + "duration": 0.0, + "text": "then I'm going to, you know, down" + }, + { + "start": 1380.64, + "duration": 0.0, + "text": "then I'm going to, you know, down project<00:23:01.040> it<00:23:01.160> back<00:23:01.480> with<00:23:01.640> W2." + }, + { + "start": 1383.15, + "duration": 0.0, + "text": "project it back with W2." + }, + { + "start": 1383.16, + "duration": 0.0, + "text": "project it back with W2. Okay,<00:23:03.400> so<00:23:03.520> what<00:23:03.679> is<00:23:03.800> this?<00:23:04.320> Now,<00:23:04.440> this<00:23:04.600> is<00:23:04.720> a" + }, + { + "start": 1384.83, + "duration": 0.0, + "text": "Okay, so what is this? Now, this is a" + }, + { + "start": 1384.84, + "duration": 0.0, + "text": "Okay, so what is this? Now, this is a RegLU.<00:23:05.440> This<00:23:05.640> is<00:23:05.840> a<00:23:06.320> you<00:23:06.520> you<00:23:06.679> make<00:23:06.880> these" + }, + { + "start": 1387.03, + "duration": 0.0, + "text": "RegLU. This is a you you make these" + }, + { + "start": 1387.04, + "duration": 0.0, + "text": "RegLU. 
This is a you you make these names<00:23:07.320> by<00:23:07.440> adding<00:23:07.720> the<00:23:07.840> first<00:23:08.200> activation,<00:23:08.840> in" + }, + { + "start": 1388.95, + "duration": 0.0, + "text": "names by adding the first activation, in" + }, + { + "start": 1388.96, + "duration": 0.0, + "text": "names by adding the first activation, in this<00:23:09.080> case<00:23:09.280> ReLU,<00:23:09.960> and<00:23:10.120> GLU,<00:23:10.720> right?<00:23:10.920> So,<00:23:11.000> the" + }, + { + "start": 1391.11, + "duration": 0.0, + "text": "this case ReLU, and GLU, right? So, the" + }, + { + "start": 1391.12, + "duration": 0.0, + "text": "this case ReLU, and GLU, right? So, the ReLU<00:23:11.720> gated<00:23:12.120> linear<00:23:12.480> unit." + }, + { + "start": 1393.47, + "duration": 0.0, + "text": "ReLU gated linear unit." + }, + { + "start": 1393.48, + "duration": 0.0, + "text": "ReLU gated linear unit. Um<00:23:14.000> and<00:23:14.160> gating<00:23:14.560> has<00:23:14.760> been<00:23:14.920> a,<00:23:15.120> you<00:23:15.240> know,<00:23:15.600> very" + }, + { + "start": 1395.95, + "duration": 0.0, + "text": "Um and gating has been a, you know, very" + }, + { + "start": 1395.96, + "duration": 0.0, + "text": "Um and gating has been a, you know, very effective<00:23:16.640> other<00:23:16.920> primitive<00:23:17.760> in" + }, + { + "start": 1398.15, + "duration": 0.0, + "text": "effective other primitive in" + }, + { + "start": 1398.16, + "duration": 0.0, + "text": "effective other primitive in architecture<00:23:18.679> design,<00:23:19.080> and<00:23:19.200> it<00:23:19.280> turns<00:23:19.600> out" + }, + { + "start": 1400.11, + "duration": 0.0, + "text": "architecture design, and it turns out" + }, + { + "start": 1400.12, + "duration": 0.0, + "text": "architecture design, and it turns out that<00:23:20.240> this<00:23:20.360> is<00:23:20.440> very<00:23:20.679> effective<00:23:21.400> in<00:23:21.520> language" + }, + { + "start": 1401.87, + 
"duration": 0.0, + "text": "that this is very effective in language" + }, + { + "start": 1401.88, + "duration": 0.0, + "text": "that this is very effective in language modeling<00:23:22.240> as<00:23:22.440> well." + }, + { + "start": 1403.669, + "duration": 0.0, + "text": "modeling as well." + }, + { + "start": 1403.679, + "duration": 0.0, + "text": "modeling as well. So," + }, + { + "start": 1404.71, + "duration": 0.0, + "text": "So," + }, + { + "start": 1404.72, + "duration": 0.0, + "text": "So, um<00:23:24.880> if<00:23:25.040> you<00:23:25.160> take<00:23:25.760> something<00:23:26.160> like<00:23:26.360> a<00:23:26.440> GELU," + }, + { + "start": 1406.95, + "duration": 0.0, + "text": "um if you take something like a GELU," + }, + { + "start": 1406.96, + "duration": 0.0, + "text": "um if you take something like a GELU, we've<00:23:27.120> already<00:23:27.360> talked<00:23:27.600> about<00:23:27.800> that,<00:23:28.000> right?" + }, + { + "start": 1408.15, + "duration": 0.0, + "text": "we've already talked about that, right?" + }, + { + "start": 1408.16, + "duration": 0.0, + "text": "we've already talked about that, right? 
That's<00:23:28.320> like<00:23:28.440> the<00:23:28.560> ReLU,<00:23:28.960> but<00:23:29.080> with<00:23:29.200> a<00:23:29.240> little" + }, + { + "start": 1409.51, + "duration": 0.0, + "text": "That's like the ReLU, but with a little" + }, + { + "start": 1409.52, + "duration": 0.0, + "text": "That's like the ReLU, but with a little divot<00:23:29.840> at<00:23:29.920> the<00:23:30.000> bottom<00:23:30.360> here,<00:23:31.000> um<00:23:31.400> you<00:23:31.520> will" + }, + { + "start": 1411.669, + "duration": 0.0, + "text": "divot at the bottom here, um you will" + }, + { + "start": 1411.679, + "duration": 0.0, + "text": "divot at the bottom here, um you will get<00:23:31.960> a<00:23:32.080> GeGLU.<00:23:33.240> Um<00:23:33.760> and<00:23:33.960> if<00:23:34.080> you<00:23:34.240> take<00:23:34.440> a" + }, + { + "start": 1414.51, + "duration": 0.0, + "text": "get a GeGLU. Um and if you take a" + }, + { + "start": 1414.52, + "duration": 0.0, + "text": "get a GeGLU. Um and if you take a SwiGLU,<00:23:35.160> which<00:23:35.360> is<00:23:35.520> X<00:23:35.720> times<00:23:36.000> a<00:23:36.080> sigmoid,<00:23:37.160> then" + }, + { + "start": 1417.59, + "duration": 0.0, + "text": "SwiGLU, which is X times a sigmoid, then" + }, + { + "start": 1417.6, + "duration": 0.0, + "text": "SwiGLU, which is X times a sigmoid, then you<00:23:37.720> will<00:23:37.840> get<00:23:38.120> a<00:23:38.400> SwiGLU.<00:23:39.320> So,<00:23:39.480> this<00:23:39.600> is<00:23:39.679> a" + }, + { + "start": 1419.75, + "duration": 0.0, + "text": "you will get a SwiGLU. So, this is a" + }, + { + "start": 1419.76, + "duration": 0.0, + "text": "you will get a SwiGLU. So, this is a Swish<00:23:40.560> times,<00:23:41.080> you<00:23:41.160> know,<00:23:41.280> the<00:23:41.400> rest<00:23:41.640> of<00:23:41.760> it." + }, + { + "start": 1422.43, + "duration": 0.0, + "text": "Swish times, you know, the rest of it." 
+ }, + { + "start": 1422.44, + "duration": 0.0, + "text": "Swish times, you know, the rest of it. Um<00:23:42.720> and<00:23:42.880> this<00:23:43.120> really<00:23:43.400> covers<00:23:44.520> a<00:23:44.600> lot<00:23:44.960> of<00:23:45.120> the" + }, + { + "start": 1425.23, + "duration": 0.0, + "text": "Um and this really covers a lot of the" + }, + { + "start": 1425.24, + "duration": 0.0, + "text": "Um and this really covers a lot of the modern<00:23:45.640> models,<00:23:46.120> right?<00:23:46.720> Um<00:23:46.960> generally<00:23:47.520> the" + }, + { + "start": 1427.669, + "duration": 0.0, + "text": "modern models, right? Um generally the" + }, + { + "start": 1427.679, + "duration": 0.0, + "text": "modern models, right? Um generally the Google<00:23:48.080> folks<00:23:48.400> have<00:23:48.560> used<00:23:48.920> GeGLU,<00:23:49.560> so<00:23:49.760> like" + }, + { + "start": 1430.03, + "duration": 0.0, + "text": "Google folks have used GeGLU, so like" + }, + { + "start": 1430.04, + "duration": 0.0, + "text": "Google folks have used GeGLU, so like the<00:23:50.120> Gemma<00:23:50.480> models,<00:23:50.960> the<00:23:51.080> T5<00:23:51.679> models<00:23:52.080> are" + }, + { + "start": 1432.19, + "duration": 0.0, + "text": "the Gemma models, the T5 models are" + }, + { + "start": 1432.2, + "duration": 0.0, + "text": "the Gemma models, the T5 models are those.<00:23:53.120> Um<00:23:53.440> and<00:23:53.600> everything<00:23:53.960> that's<00:23:54.120> kind<00:23:54.280> of" + }, + { + "start": 1434.35, + "duration": 0.0, + "text": "those. Um and everything that's kind of" + }, + { + "start": 1434.36, + "duration": 0.0, + "text": "those. Um and everything that's kind of like<00:23:54.480> a<00:23:54.520> LLaMA<00:23:55.000> descendant<00:23:55.600> uses<00:23:55.840> a<00:23:55.920> SwiGLU." + }, + { + "start": 1437.15, + "duration": 0.0, + "text": "like a LLaMA descendant uses a SwiGLU." 
+ }, + { + "start": 1437.16, + "duration": 0.0, + "text": "like a LLaMA descendant uses a SwiGLU. Um<00:23:57.280> so,<00:23:57.800> PaLM<00:23:58.480> and<00:23:58.640> the<00:23:58.720> LLaMA<00:23:59.000> descendants" + }, + { + "start": 1439.91, + "duration": 0.0, + "text": "Um so, PaLM and the LLaMA descendants" + }, + { + "start": 1439.92, + "duration": 0.0, + "text": "Um so, PaLM and the LLaMA descendants are<00:24:00.160> all<00:24:00.440> kind<00:24:00.600> of<00:24:00.720> SwiGLU<00:24:01.200> models.<00:24:02.160> Um" + }, + { + "start": 1443.15, + "duration": 0.0, + "text": "are all kind of SwiGLU models. Um" + }, + { + "start": 1443.16, + "duration": 0.0, + "text": "are all kind of SwiGLU models. Um I<00:24:03.280> would<00:24:03.480> say<00:24:03.640> that<00:24:03.960> SwiGLU<00:24:04.440> is<00:24:04.560> probably<00:24:04.880> the" + }, + { + "start": 1444.95, + "duration": 0.0, + "text": "I would say that SwiGLU is probably the" + }, + { + "start": 1444.96, + "duration": 0.0, + "text": "I would say that SwiGLU is probably the more<00:24:05.160> dominant<00:24:05.640> one,<00:24:06.160> but<00:24:06.440> honestly<00:24:06.840> amongst" + }, + { + "start": 1447.19, + "duration": 0.0, + "text": "more dominant one, but honestly amongst" + }, + { + "start": 1447.2, + "duration": 0.0, + "text": "more dominant one, but honestly amongst the<00:24:07.320> gated<00:24:07.800> units,<00:24:08.440> doesn't<00:24:08.840> really<00:24:09.120> matter." + }, + { + "start": 1450.39, + "duration": 0.0, + "text": "the gated units, doesn't really matter." + }, + { + "start": 1450.4, + "duration": 0.0, + "text": "the gated units, doesn't really matter. 
Now,<00:24:10.560> here's<00:24:10.800> a<00:24:10.880> side<00:24:11.200> note<00:24:11.400> that<00:24:11.560> will<00:24:12.480> uh<00:24:12.560> be" + }, + { + "start": 1452.75, + "duration": 0.0, + "text": "Now, here's a side note that will uh be" + }, + { + "start": 1452.76, + "duration": 0.0, + "text": "Now, here's a side note that will uh be a<00:24:13.320> semi-important<00:24:14.240> piece<00:24:14.440> of<00:24:14.560> trivia<00:24:14.960> later." + }, + { + "start": 1456.03, + "duration": 0.0, + "text": "a semi-important piece of trivia later." + }, + { + "start": 1456.04, + "duration": 0.0, + "text": "a semi-important piece of trivia later. Um<00:24:16.200> if<00:24:16.360> you<00:24:16.480> look<00:24:16.760> up<00:24:16.920> here,<00:24:17.720> right?<00:24:18.640> Um<00:24:18.880> you" + }, + { + "start": 1458.99, + "duration": 0.0, + "text": "Um if you look up here, right? Um you" + }, + { + "start": 1459.0, + "duration": 0.0, + "text": "Um if you look up here, right? Um you will<00:24:19.120> notice<00:24:19.560> that<00:24:19.840> there<00:24:19.960> are<00:24:20.040> more" + }, + { + "start": 1460.31, + "duration": 0.0, + "text": "will notice that there are more" + }, + { + "start": 1460.32, + "duration": 0.0, + "text": "will notice that there are more parameters<00:24:21.040> for<00:24:21.200> the<00:24:21.320> gated<00:24:22.240> uh<00:24:22.320> model," + }, + { + "start": 1462.669, + "duration": 0.0, + "text": "parameters for the gated uh model," + }, + { + "start": 1462.679, + "duration": 0.0, + "text": "parameters for the gated uh model, right?<00:24:22.880> Cuz<00:24:23.040> I<00:24:23.160> have<00:24:23.520> this<00:24:23.679> parameter<00:24:24.080> of<00:24:24.200> V." + }, + { + "start": 1465.11, + "duration": 0.0, + "text": "right? Cuz I have this parameter of V." + }, + { + "start": 1465.12, + "duration": 0.0, + "text": "right? Cuz I have this parameter of V. 
And<00:24:25.240> so,<00:24:25.360> if<00:24:25.480> you<00:24:25.560> do<00:24:25.720> a<00:24:25.760> little<00:24:26.000> bit<00:24:26.120> of<00:24:26.240> math," + }, + { + "start": 1466.63, + "duration": 0.0, + "text": "And so, if you do a little bit of math," + }, + { + "start": 1466.64, + "duration": 0.0, + "text": "And so, if you do a little bit of math, right?<00:24:27.000> I<00:24:27.080> now<00:24:27.280> have<00:24:27.520> three<00:24:27.960> matrices<00:24:28.800> instead" + }, + { + "start": 1469.15, + "duration": 0.0, + "text": "right? I now have three matrices instead" + }, + { + "start": 1469.16, + "duration": 0.0, + "text": "right? I now have three matrices instead of<00:24:29.360> two<00:24:29.640> matrices," + }, + { + "start": 1471.43, + "duration": 0.0, + "text": "of two matrices," + }, + { + "start": 1471.44, + "duration": 0.0, + "text": "of two matrices, right?<00:24:32.320> What<00:24:32.480> you<00:24:32.600> should<00:24:32.760> do<00:24:32.960> is<00:24:33.080> you<00:24:33.160> should" + }, + { + "start": 1473.31, + "duration": 0.0, + "text": "right? What you should do is you should" + }, + { + "start": 1473.32, + "duration": 0.0, + "text": "right? 
What you should do is you should maybe<00:24:33.840> use<00:24:34.040> a<00:24:34.160> smaller<00:24:34.840> feed<00:24:35.080> forward" + }, + { + "start": 1475.31, + "duration": 0.0, + "text": "maybe use a smaller feed forward" + }, + { + "start": 1475.32, + "duration": 0.0, + "text": "maybe use a smaller feed forward dimension<00:24:35.960> by<00:24:36.440> a<00:24:36.480> factor<00:24:36.800> of<00:24:36.920> 2/3<00:24:37.880> in<00:24:38.040> order<00:24:38.240> to" + }, + { + "start": 1478.35, + "duration": 0.0, + "text": "dimension by a factor of 2/3 in order to" + }, + { + "start": 1478.36, + "duration": 0.0, + "text": "dimension by a factor of 2/3 in order to keep<00:24:38.600> the<00:24:38.760> total<00:24:39.160> parameter<00:24:39.600> count<00:24:39.840> the<00:24:39.960> same," + }, + { + "start": 1480.39, + "duration": 0.0, + "text": "keep the total parameter count the same," + }, + { + "start": 1480.4, + "duration": 0.0, + "text": "keep the total parameter count the same, right?<00:24:40.560> So,<00:24:40.640> this<00:24:40.840> is<00:24:41.000> roughly<00:24:41.440> the<00:24:41.560> idea<00:24:42.000> of," + }, + { + "start": 1482.55, + "duration": 0.0, + "text": "right? So, this is roughly the idea of," + }, + { + "start": 1482.56, + "duration": 0.0, + "text": "right? 
So, this is roughly the idea of, \"Well,<00:24:42.800> I<00:24:42.880> want<00:24:43.040> to<00:24:43.120> keep<00:24:43.280> the<00:24:43.400> same<00:24:43.800> number<00:24:44.080> of" + }, + { + "start": 1484.23, + "duration": 0.0, + "text": "\"Well, I want to keep the same number of" + }, + { + "start": 1484.24, + "duration": 0.0, + "text": "\"Well, I want to keep the same number of total<00:24:44.520> parameters<00:24:45.120> as<00:24:45.240> my<00:24:45.400> original<00:24:45.840> MLP,<00:24:46.679> but" + }, + { + "start": 1486.79, + "duration": 0.0, + "text": "total parameters as my original MLP, but" + }, + { + "start": 1486.8, + "duration": 0.0, + "text": "total parameters as my original MLP, but I<00:24:46.880> now<00:24:47.120> want<00:24:47.280> to<00:24:47.320> make<00:24:47.520> it<00:24:47.640> gated,<00:24:48.360> so<00:24:48.440> I'm" + }, + { + "start": 1488.51, + "duration": 0.0, + "text": "I now want to make it gated, so I'm" + }, + { + "start": 1488.52, + "duration": 0.0, + "text": "I now want to make it gated, so I'm going<00:24:48.720> to<00:24:48.800> make<00:24:49.040> the<00:24:49.240> feed<00:24:49.520> forward" + }, + { + "start": 1489.75, + "duration": 0.0, + "text": "going to make the feed forward" + }, + { + "start": 1489.76, + "duration": 0.0, + "text": "going to make the feed forward dimension,<00:24:50.240> which<00:24:50.440> is<00:24:50.560> the<00:24:50.720> output<00:24:51.040> dimension" + }, + { + "start": 1491.43, + "duration": 0.0, + "text": "dimension, which is the output dimension" + }, + { + "start": 1491.44, + "duration": 0.0, + "text": "dimension, which is the output dimension of<00:24:51.560> this<00:24:51.800> W,<00:24:52.560> a<00:24:52.640> little<00:24:53.000> bit<00:24:53.240> smaller<00:24:53.679> by<00:24:53.840> 2/3," + }, + { + "start": 1494.59, + "duration": 0.0, + "text": "of this W, a little bit smaller by 2/3," + }, + { + "start": 1494.6, + "duration": 0.0, + "text": "of this W, a little bit smaller by 
2/3, right?\"<00:24:54.800> So,<00:24:54.920> this<00:24:55.080> is<00:24:55.200> a<00:24:55.240> general<00:24:55.760> rule<00:24:55.920> of" + }, + { + "start": 1496.03, + "duration": 0.0, + "text": "right?\" So, this is a general rule of" + }, + { + "start": 1496.04, + "duration": 0.0, + "text": "right?\" So, this is a general rule of thumb<00:24:56.200> that<00:24:56.360> people<00:24:56.560> have<00:24:56.720> followed,<00:24:57.440> but" + }, + { + "start": 1497.55, + "duration": 0.0, + "text": "thumb that people have followed, but" + }, + { + "start": 1497.56, + "duration": 0.0, + "text": "thumb that people have followed, but it's<00:24:57.679> not<00:24:57.920> really<00:24:58.160> an<00:24:58.360> iron<00:24:58.800> rule." + }, + { + "start": 1500.35, + "duration": 0.0, + "text": "it's not really an iron rule." + }, + { + "start": 1500.36, + "duration": 0.0, + "text": "it's not really an iron rule. Um" + }, + { + "start": 1501.79, + "duration": 0.0, + "text": "Um" + }, + { + "start": 1501.8, + "duration": 0.0, + "text": "Um You<00:25:01.880> know,<00:25:02.000> the<00:25:02.120> original<00:25:02.560> Noam<00:25:02.800> Shazeer" + }, + { + "start": 1503.23, + "duration": 0.0, + "text": "You know, the original Noam Shazeer" + }, + { + "start": 1503.24, + "duration": 0.0, + "text": "You know, the original Noam Shazeer paper<00:25:03.720> that,<00:25:04.120> you<00:25:04.200> know,<00:25:04.440> proposed<00:25:04.960> this," + }, + { + "start": 1505.87, + "duration": 0.0, + "text": "paper that, you know, proposed this," + }, + { + "start": 1505.88, + "duration": 0.0, + "text": "paper that, you know, proposed this, um" + }, + { + "start": 1506.47, + "duration": 0.0, + "text": "um" + }, + { + "start": 1506.48, + "duration": 0.0, + "text": "um had<00:25:06.640> some,<00:25:07.560> you<00:25:07.640> know,<00:25:07.840> very<00:25:08.320> small<00:25:09.280> deltas" + }, + { + "start": 1509.75, + "duration": 0.0, + "text": "had some, you know, very small deltas" + 
}, + { + "start": 1509.76, + "duration": 0.0, + "text": "had some, you know, very small deltas originally,<00:25:10.840> but<00:25:11.320> they're<00:25:11.520> consistent" + }, + { + "start": 1512.11, + "duration": 0.0, + "text": "originally, but they're consistent" + }, + { + "start": 1512.12, + "duration": 0.0, + "text": "originally, but they're consistent deltas,<00:25:12.880> and<00:25:13.000> I<00:25:13.040> think<00:25:13.280> to<00:25:13.400> his<00:25:13.640> credit,<00:25:14.720> um" + }, + { + "start": 1515.35, + "duration": 0.0, + "text": "deltas, and I think to his credit, um" + }, + { + "start": 1515.36, + "duration": 0.0, + "text": "deltas, and I think to his credit, um I<00:25:15.440> think<00:25:15.640> a<00:25:15.679> lot<00:25:15.920> of<00:25:15.960> his<00:25:16.120> papers<00:25:16.600> have<00:25:16.840> these" + }, + { + "start": 1516.99, + "duration": 0.0, + "text": "I think a lot of his papers have these" + }, + { + "start": 1517.0, + "duration": 0.0, + "text": "I think a lot of his papers have these like<00:25:17.240> error<00:25:17.560> bar<00:25:17.800> assessments<00:25:18.320> of<00:25:18.480> like" + }, + { + "start": 1518.669, + "duration": 0.0, + "text": "like error bar assessments of like" + }, + { + "start": 1518.679, + "duration": 0.0, + "text": "like error bar assessments of like training<00:25:19.160> multiple<00:25:19.560> replicates<00:25:20.160> and" + }, + { + "start": 1520.27, + "duration": 0.0, + "text": "training multiple replicates and" + }, + { + "start": 1520.28, + "duration": 0.0, + "text": "training multiple replicates and checking<00:25:20.560> to<00:25:20.679> see<00:25:20.880> if<00:25:21.000> they're<00:25:21.160> better.<00:25:22.080> Um" + }, + { + "start": 1522.59, + "duration": 0.0, + "text": "checking to see if they're better. Um" + }, + { + "start": 1522.6, + "duration": 0.0, + "text": "checking to see if they're better. 
Um and<00:25:22.720> if<00:25:22.840> you<00:25:22.920> look,<00:25:23.200> the<00:25:23.320> GLU<00:25:23.920> variants<00:25:24.880> are" + }, + { + "start": 1525.43, + "duration": 0.0, + "text": "and if you look, the GLU variants are" + }, + { + "start": 1525.44, + "duration": 0.0, + "text": "and if you look, the GLU variants are almost<00:25:25.840> always<00:25:26.200> consistently<00:25:26.880> better<00:25:27.520> than" + }, + { + "start": 1527.669, + "duration": 0.0, + "text": "almost always consistently better than" + }, + { + "start": 1527.679, + "duration": 0.0, + "text": "almost always consistently better than the<00:25:27.800> non-GLU<00:25:28.720> variants.<00:25:29.200> And<00:25:29.280> this<00:25:29.400> is<00:25:29.480> a" + }, + { + "start": 1529.55, + "duration": 0.0, + "text": "the non-GLU variants. And this is a" + }, + { + "start": 1529.56, + "duration": 0.0, + "text": "the non-GLU variants. And this is a parameter<00:25:30.040> matched<00:25:30.360> comparison<00:25:30.920> because<00:25:31.840> um" + }, + { + "start": 1532.15, + "duration": 0.0, + "text": "parameter matched comparison because um" + }, + { + "start": 1532.16, + "duration": 0.0, + "text": "parameter matched comparison because um Noam<00:25:32.360> Shazeer<00:25:32.679> is<00:25:32.760> always<00:25:33.040> doing<00:25:33.280> this<00:25:33.480> 2/3" + }, + { + "start": 1534.23, + "duration": 0.0, + "text": "Noam Shazeer is always doing this 2/3" + }, + { + "start": 1534.24, + "duration": 0.0, + "text": "Noam Shazeer is always doing this 2/3 adjustment<00:25:35.160> to<00:25:35.280> make<00:25:35.480> sure<00:25:35.640> that<00:25:35.880> all<00:25:36.040> of<00:25:36.120> the" + }, + { + "start": 1536.19, + "duration": 0.0, + "text": "adjustment to make sure that all of the" + }, + { + "start": 1536.2, + "duration": 0.0, + "text": "adjustment to make sure that all of the models<00:25:36.600> have<00:25:36.720> the<00:25:36.840> total<00:25:37.240> same<00:25:37.560> 
total<00:25:37.880> number" + }, + { + "start": 1538.83, + "duration": 0.0, + "text": "models have the total same total number" + }, + { + "start": 1538.84, + "duration": 0.0, + "text": "models have the total same total number um<00:25:39.080> of<00:25:39.240> parameters." + }, + { + "start": 1540.83, + "duration": 0.0, + "text": "um of parameters." + }, + { + "start": 1540.84, + "duration": 0.0, + "text": "um of parameters. So,<00:25:40.880> this<00:25:41.080> is<00:25:41.200> quite<00:25:41.480> nice.<00:25:42.160> It's<00:25:42.640> in<00:25:42.800> some" + }, + { + "start": 1542.99, + "duration": 0.0, + "text": "So, this is quite nice. It's in some" + }, + { + "start": 1543.0, + "duration": 0.0, + "text": "So, this is quite nice. It's in some ways<00:25:43.160> a<00:25:43.240> free<00:25:43.640> win.<00:25:44.240> Um<00:25:44.360> almost<00:25:44.760> everyone<00:25:45.720> uses" + }, + { + "start": 1546.07, + "duration": 0.0, + "text": "ways a free win. Um almost everyone uses" + }, + { + "start": 1546.08, + "duration": 0.0, + "text": "ways a free win. Um almost everyone uses a<00:25:46.120> GLU.<00:25:46.720> There<00:25:46.920> have<00:25:47.400> been<00:25:47.800> other<00:25:48.520> sort<00:25:48.760> of" + }, + { + "start": 1548.87, + "duration": 0.0, + "text": "a GLU. There have been other sort of" + }, + { + "start": 1548.88, + "duration": 0.0, + "text": "a GLU. There have been other sort of more<00:25:49.080> controlled<00:25:49.560> systematic<00:25:50.040> comparisons." + }, + { + "start": 1550.55, + "duration": 0.0, + "text": "more controlled systematic comparisons." + }, + { + "start": 1550.56, + "duration": 0.0, + "text": "more controlled systematic comparisons. 
This<00:25:50.720> is<00:25:51.120> uh<00:25:51.240> the<00:25:51.360> same<00:25:51.600> paper<00:25:51.920> I<00:25:52.000> was<00:25:52.160> talking" + }, + { + "start": 1552.39, + "duration": 0.0, + "text": "This is uh the same paper I was talking" + }, + { + "start": 1552.4, + "duration": 0.0, + "text": "This is uh the same paper I was talking about<00:25:52.520> before,<00:25:52.760> Noam<00:25:53.040> et<00:25:53.120> al.<00:25:53.240> in<00:25:53.360> 2020." + }, + { + "start": 1554.75, + "duration": 0.0, + "text": "about before, Noam et al. in 2020." + }, + { + "start": 1554.76, + "duration": 0.0, + "text": "about before, Noam et al. in 2020. Um<00:25:55.040> Google<00:25:55.800> actually<00:25:56.080> in<00:25:56.160> the<00:25:56.240> 2020s<00:25:57.440> did" + }, + { + "start": 1557.83, + "duration": 0.0, + "text": "Um Google actually in the 2020s did" + }, + { + "start": 1557.84, + "duration": 0.0, + "text": "Um Google actually in the 2020s did quite<00:25:58.240> a<00:25:58.280> few<00:25:58.560> nice<00:25:59.200> large-scale" + }, + { + "start": 1559.99, + "duration": 0.0, + "text": "quite a few nice large-scale" + }, + { + "start": 1560.0, + "duration": 0.0, + "text": "quite a few nice large-scale architecture<00:26:00.480> comparison<00:26:01.040> papers,<00:26:01.920> um" + }, + { + "start": 1562.03, + "duration": 0.0, + "text": "architecture comparison papers, um" + }, + { + "start": 1562.04, + "duration": 0.0, + "text": "architecture comparison papers, um although<00:26:02.360> with<00:26:02.600> a<00:26:02.679> T5<00:26:03.160> architecture<00:26:03.640> and<00:26:03.760> not" + }, + { + "start": 1564.55, + "duration": 0.0, + "text": "although with a T5 architecture and not" + }, + { + "start": 1564.56, + "duration": 0.0, + "text": "although with a T5 architecture and not uh<00:26:04.720> autoregressive<00:26:05.800> uh<00:26:05.880> language<00:26:06.240> model.<00:26:07.120> Um" + }, + { + "start": 1567.31, + "duration": 0.0, + "text": "uh 
autoregressive uh language model. Um" + }, + { + "start": 1567.32, + "duration": 0.0, + "text": "uh autoregressive uh language model. Um and<00:26:07.440> they,<00:26:07.760> you<00:26:07.840> know,<00:26:08.040> basically" + }, + { + "start": 1568.39, + "duration": 0.0, + "text": "and they, you know, basically" + }, + { + "start": 1568.4, + "duration": 0.0, + "text": "and they, you know, basically comprehensively<00:26:09.040> compare<00:26:09.400> things<00:26:09.640> like" + }, + { + "start": 1569.83, + "duration": 0.0, + "text": "comprehensively compare things like" + }, + { + "start": 1569.84, + "duration": 0.0, + "text": "comprehensively compare things like GLUs,<00:26:10.240> and<00:26:10.360> you<00:26:10.440> see<00:26:10.560> once<00:26:10.760> again,<00:26:11.440> um<00:26:11.520> if<00:26:11.640> we" + }, + { + "start": 1571.71, + "duration": 0.0, + "text": "GLUs, and you see once again, um if we" + }, + { + "start": 1571.72, + "duration": 0.0, + "text": "GLUs, and you see once again, um if we look<00:26:11.880> at<00:26:11.960> the<00:26:12.040> SwiGLU<00:26:12.840> or<00:26:12.960> the<00:26:13.080> GeGLU<00:26:13.679> or<00:26:13.800> the" + }, + { + "start": 1573.95, + "duration": 0.0, + "text": "look at the SwiGLU or the GeGLU or the" + }, + { + "start": 1573.96, + "duration": 0.0, + "text": "look at the SwiGLU or the GeGLU or the GLUs<00:26:14.800> in<00:26:14.960> general,<00:26:15.679> they<00:26:15.840> do<00:26:15.960> significantly" + }, + { + "start": 1576.63, + "duration": 0.0, + "text": "GLUs in general, they do significantly" + }, + { + "start": 1576.64, + "duration": 0.0, + "text": "GLUs in general, they do significantly better<00:26:16.880> at<00:26:17.000> loss<00:26:17.560> or<00:26:17.720> the<00:26:17.880> other<00:26:18.080> downstream" + }, + { + "start": 1578.55, + "duration": 0.0, + "text": "better at loss or the other downstream" + }, + { + "start": 1578.56, + "duration": 0.0, + "text": "better at loss or the other downstream 
metrics,<00:26:19.040> right?" + }, + { + "start": 1580.19, + "duration": 0.0, + "text": "metrics, right?" + }, + { + "start": 1580.2, + "duration": 0.0, + "text": "metrics, right? Fairly<00:26:20.480> compelling<00:26:20.920> on<00:26:21.040> paper<00:26:21.560> uh<00:26:21.679> on<00:26:21.800> these" + }, + { + "start": 1581.95, + "duration": 0.0, + "text": "Fairly compelling on paper uh on these" + }, + { + "start": 1581.96, + "duration": 0.0, + "text": "Fairly compelling on paper uh on these papers,<00:26:22.880> also<00:26:23.200> clear<00:26:23.760> from<00:26:24.000> now<00:26:24.320> a<00:26:24.360> lot<00:26:24.640> of" + }, + { + "start": 1584.71, + "duration": 0.0, + "text": "papers, also clear from now a lot of" + }, + { + "start": 1584.72, + "duration": 0.0, + "text": "papers, also clear from now a lot of model<00:26:24.960> training<00:26:25.360> runs<00:26:26.040> that<00:26:26.400> SwiGLU<00:26:27.280> and<00:26:27.440> GLU" + }, + { + "start": 1588.31, + "duration": 0.0, + "text": "model training runs that SwiGLU and GLU" + }, + { + "start": 1588.32, + "duration": 0.0, + "text": "model training runs that SwiGLU and GLU are<00:26:28.679> good,<00:26:29.200> right?" + }, + { + "start": 1590.35, + "duration": 0.0, + "text": "are good, right?" + }, + { + "start": 1590.36, + "duration": 0.0, + "text": "are good, right? 
So,<00:26:30.440> there's<00:26:30.640> a<00:26:30.679> lot<00:26:30.960> of<00:26:31.080> variations<00:26:31.720> in" + }, + { + "start": 1591.87, + "duration": 0.0, + "text": "So, there's a lot of variations in" + }, + { + "start": 1591.88, + "duration": 0.0, + "text": "So, there's a lot of variations in gating,<00:26:32.320> but<00:26:32.480> really<00:26:32.679> the<00:26:32.840> important<00:26:33.440> single" + }, + { + "start": 1593.87, + "duration": 0.0, + "text": "gating, but really the important single" + }, + { + "start": 1593.88, + "duration": 0.0, + "text": "gating, but really the important single axis<00:26:34.400> to<00:26:34.560> know<00:26:35.280> is<00:26:35.440> that<00:26:35.720> gating<00:26:36.560> uh<00:26:36.679> for<00:26:36.840> these" + }, + { + "start": 1597.27, + "duration": 0.0, + "text": "axis to know is that gating uh for these" + }, + { + "start": 1597.28, + "duration": 0.0, + "text": "axis to know is that gating uh for these nonlinearities<00:26:38.000> is<00:26:38.120> actually<00:26:38.400> quite" + }, + { + "start": 1598.75, + "duration": 0.0, + "text": "nonlinearities is actually quite" + }, + { + "start": 1598.76, + "duration": 0.0, + "text": "nonlinearities is actually quite important,<00:26:39.360> gives<00:26:39.600> you" + }, + { + "start": 1600.51, + "duration": 0.0, + "text": "important, gives you" + }, + { + "start": 1600.52, + "duration": 0.0, + "text": "important, gives you uh<00:26:40.600> nice<00:26:41.000> boost<00:26:41.440> without<00:26:41.760> much<00:26:41.960> of<00:26:42.040> a" + }, + { + "start": 1602.07, + "duration": 0.0, + "text": "uh nice boost without much of a" + }, + { + "start": 1602.08, + "duration": 0.0, + "text": "uh nice boost without much of a computational<00:26:42.720> cost.<00:26:43.720> Um<00:26:44.480> you<00:26:44.560> know,<00:26:44.760> that's" + }, + { + "start": 1604.95, + "duration": 0.0, + "text": "computational cost. 
Um you know, that's" + }, + { + "start": 1604.96, + "duration": 0.0, + "text": "computational cost. Um you know, that's not<00:26:45.080> to<00:26:45.200> say<00:26:45.360> that<00:26:45.760> gated<00:26:46.080> linear<00:26:46.320> units<00:26:46.520> are" + }, + { + "start": 1606.63, + "duration": 0.0, + "text": "not to say that gated linear units are" + }, + { + "start": 1606.64, + "duration": 0.0, + "text": "not to say that gated linear units are necessary.<00:26:47.360> I<00:26:47.400> mean,<00:26:47.560> GPT-3<00:26:48.760> was<00:26:48.960> that.<00:26:49.520> Um<00:26:49.640> I" + }, + { + "start": 1609.669, + "duration": 0.0, + "text": "necessary. I mean, GPT-3 was that. Um I" + }, + { + "start": 1609.679, + "duration": 0.0, + "text": "necessary. I mean, GPT-3 was that. Um I think<00:26:49.840> the<00:26:50.000> NeMo<00:26:50.240> Tron<00:26:50.520> 340B<00:26:51.480> model<00:26:51.800> used<00:26:51.960> a" + }, + { + "start": 1611.99, + "duration": 0.0, + "text": "think the NeMo Tron 340B model used a" + }, + { + "start": 1612.0, + "duration": 0.0, + "text": "think the NeMo Tron 340B model used a squared<00:26:52.440> ReLU,<00:26:52.800> which<00:26:52.960> is<00:26:53.040> a<00:26:53.320> kind<00:26:53.520> of<00:26:53.600> a<00:26:53.679> crazy" + }, + { + "start": 1614.03, + "duration": 0.0, + "text": "squared ReLU, which is a kind of a crazy" + }, + { + "start": 1614.04, + "duration": 0.0, + "text": "squared ReLU, which is a kind of a crazy choice,<00:26:54.440> but<00:26:54.679> that<00:26:54.840> works,<00:26:55.160> too.<00:26:56.000> Um<00:26:56.360> both<00:26:56.600> of" + }, + { + "start": 1616.669, + "duration": 0.0, + "text": "choice, but that works, too. Um both of" + }, + { + "start": 1616.679, + "duration": 0.0, + "text": "choice, but that works, too. 
Um both of these<00:26:56.840> models<00:26:57.280> are<00:26:57.400> perfectly<00:26:57.840> performant," + }, + { + "start": 1618.63, + "duration": 0.0, + "text": "these models are perfectly performant," + }, + { + "start": 1618.64, + "duration": 0.0, + "text": "these models are perfectly performant, but<00:26:58.760> it's<00:26:58.880> actually<00:26:59.160> quite<00:26:59.600> rare<00:27:00.360> to<00:27:00.480> see" + }, + { + "start": 1620.669, + "duration": 0.0, + "text": "but it's actually quite rare to see" + }, + { + "start": 1620.679, + "duration": 0.0, + "text": "but it's actually quite rare to see anything<00:27:01.040> that's<00:27:01.200> not<00:27:01.520> trained<00:27:01.880> on<00:27:02.000> a<00:27:02.040> gated" + }, + { + "start": 1622.75, + "duration": 0.0, + "text": "anything that's not trained on a gated" + }, + { + "start": 1622.76, + "duration": 0.0, + "text": "anything that's not trained on a gated linear<00:27:03.160> unit,<00:27:03.640> right?<00:27:03.760> So,<00:27:03.880> evidence<00:27:04.240> is" + }, + { + "start": 1624.43, + "duration": 0.0, + "text": "linear unit, right? So, evidence is" + }, + { + "start": 1624.44, + "duration": 0.0, + "text": "linear unit, right? So, evidence is pointing<00:27:04.720> towards<00:27:05.000> consistent<00:27:05.480> gains<00:27:06.160> on" + }, + { + "start": 1626.35, + "duration": 0.0, + "text": "pointing towards consistent gains on" + }, + { + "start": 1626.36, + "duration": 0.0, + "text": "pointing towards consistent gains on using<00:27:06.720> these<00:27:07.360> gating<00:27:07.720> tricks." + }, + { + "start": 1629.43, + "duration": 0.0, + "text": "using these gating tricks." + }, + { + "start": 1629.44, + "duration": 0.0, + "text": "using these gating tricks. Okay." + }, + { + "start": 1630.39, + "duration": 0.0, + "text": "Okay." + }, + { + "start": 1630.4, + "duration": 0.0, + "text": "Okay. 
So,<00:27:11.280> those<00:27:11.600> are<00:27:11.720> I<00:27:11.800> think<00:27:12.080> the<00:27:12.360> the<00:27:12.480> more" + }, + { + "start": 1632.669, + "duration": 0.0, + "text": "So, those are I think the the more" + }, + { + "start": 1632.679, + "duration": 0.0, + "text": "So, those are I think the the more consensus<00:27:13.440> choices<00:27:14.280> for<00:27:14.440> things<00:27:14.679> that<00:27:14.800> we<00:27:14.920> can" + }, + { + "start": 1635.07, + "duration": 0.0, + "text": "consensus choices for things that we can" + }, + { + "start": 1635.08, + "duration": 0.0, + "text": "consensus choices for things that we can do<00:27:15.240> in<00:27:15.360> architecture.<00:27:16.560> Um<00:27:16.840> now,<00:27:17.000> this<00:27:17.280> one<00:27:17.520> I" + }, + { + "start": 1637.59, + "duration": 0.0, + "text": "do in architecture. Um now, this one I" + }, + { + "start": 1637.6, + "duration": 0.0, + "text": "do in architecture. Um now, this one I think<00:27:17.760> is<00:27:17.880> a<00:27:17.920> really<00:27:18.160> fun<00:27:18.440> idea,<00:27:19.040> but<00:27:19.200> one<00:27:19.400> that" + }, + { + "start": 1639.669, + "duration": 0.0, + "text": "think is a really fun idea, but one that" + }, + { + "start": 1639.679, + "duration": 0.0, + "text": "think is a really fun idea, but one that I<00:27:19.760> think<00:27:20.080> now<00:27:20.360> the<00:27:20.520> test<00:27:20.800> of<00:27:20.920> time<00:27:21.160> has<00:27:21.320> shown" + }, + { + "start": 1642.07, + "duration": 0.0, + "text": "I think now the test of time has shown" + }, + { + "start": 1642.08, + "duration": 0.0, + "text": "I think now the test of time has shown maybe<00:27:22.360> is<00:27:22.480> not<00:27:22.880> quite<00:27:23.120> as<00:27:23.280> good<00:27:23.520> or<00:27:23.640> maybe<00:27:23.880> not" + }, + { + "start": 1644.11, + "duration": 0.0, + "text": "maybe is not quite as good or maybe not" + }, + { + "start": 1644.12, + "duration": 0.0, + 
"text": "maybe is not quite as good or maybe not as<00:27:24.280> popular<00:27:24.760> of<00:27:24.840> an<00:27:24.960> idea." + }, + { + "start": 1646.03, + "duration": 0.0, + "text": "as popular of an idea." + }, + { + "start": 1646.04, + "duration": 0.0, + "text": "as popular of an idea. Um<00:27:26.360> normally,<00:27:27.280> we<00:27:27.400> do<00:27:27.560> our<00:27:27.640> transformer" + }, + { + "start": 1648.19, + "duration": 0.0, + "text": "Um normally, we do our transformer" + }, + { + "start": 1648.2, + "duration": 0.0, + "text": "Um normally, we do our transformer blocks<00:27:28.760> serially,<00:27:29.360> right?<00:27:29.560> We<00:27:29.679> compute<00:27:30.120> our" + }, + { + "start": 1650.23, + "duration": 0.0, + "text": "blocks serially, right? We compute our" + }, + { + "start": 1650.24, + "duration": 0.0, + "text": "blocks serially, right? We compute our attention,<00:27:31.200> then<00:27:31.360> we<00:27:31.480> compute<00:27:31.840> the<00:27:31.960> MLP," + }, + { + "start": 1652.51, + "duration": 0.0, + "text": "attention, then we compute the MLP," + }, + { + "start": 1652.52, + "duration": 0.0, + "text": "attention, then we compute the MLP, right?<00:27:32.720> One<00:27:32.920> after<00:27:33.200> the<00:27:33.360> other." + }, + { + "start": 1654.19, + "duration": 0.0, + "text": "right? One after the other." + }, + { + "start": 1654.2, + "duration": 0.0, + "text": "right? One after the other. 
Um<00:27:34.720> if<00:27:34.880> you're<00:27:35.000> very<00:27:35.240> systems-minded,<00:27:36.240> you" + }, + { + "start": 1656.31, + "duration": 0.0, + "text": "Um if you're very systems-minded, you" + }, + { + "start": 1656.32, + "duration": 0.0, + "text": "Um if you're very systems-minded, you might<00:27:36.560> say,<00:27:36.800> \"Well,<00:27:37.000> this<00:27:37.160> introduces<00:27:37.640> a" + }, + { + "start": 1657.71, + "duration": 0.0, + "text": "might say, \"Well, this introduces a" + }, + { + "start": 1657.72, + "duration": 0.0, + "text": "might say, \"Well, this introduces a bottleneck,<00:27:38.400> right?<00:27:38.640> I<00:27:38.679> have<00:27:38.920> to<00:27:39.040> wait<00:27:39.400> for" + }, + { + "start": 1659.51, + "duration": 0.0, + "text": "bottleneck, right? I have to wait for" + }, + { + "start": 1659.52, + "duration": 0.0, + "text": "bottleneck, right? I have to wait for the<00:27:39.600> computation<00:27:40.200> of<00:27:40.360> one<00:27:40.960> to<00:27:41.120> do<00:27:41.280> the<00:27:41.440> other." + }, + { + "start": 1661.99, + "duration": 0.0, + "text": "the computation of one to do the other." + }, + { + "start": 1662.0, + "duration": 0.0, + "text": "the computation of one to do the other. 
If<00:27:42.160> they<00:27:42.280> were<00:27:42.440> instead<00:27:42.720> in<00:27:42.840> parallel,<00:27:43.600> I" + }, + { + "start": 1663.669, + "duration": 0.0, + "text": "If they were instead in parallel, I" + }, + { + "start": 1663.679, + "duration": 0.0, + "text": "If they were instead in parallel, I could<00:27:43.880> bring<00:27:44.120> to<00:27:44.240> bear<00:27:44.520> some<00:27:44.760> new<00:27:45.040> and<00:27:45.160> cool" + }, + { + "start": 1665.47, + "duration": 0.0, + "text": "could bring to bear some new and cool" + }, + { + "start": 1665.48, + "duration": 0.0, + "text": "could bring to bear some new and cool systems<00:27:45.960> optimizations,<00:27:46.679> potentially," + }, + { + "start": 1667.19, + "duration": 0.0, + "text": "systems optimizations, potentially," + }, + { + "start": 1667.2, + "duration": 0.0, + "text": "systems optimizations, potentially, right?\"<00:27:47.920> So,<00:27:48.080> you<00:27:48.160> might<00:27:48.360> ask,<00:27:48.600> \"Could<00:27:48.720> we" + }, + { + "start": 1668.87, + "duration": 0.0, + "text": "right?\" So, you might ask, \"Could we" + }, + { + "start": 1668.88, + "duration": 0.0, + "text": "right?\" So, you might ask, \"Could we parallelize<00:27:49.560> the<00:27:49.640> transformer<00:27:50.240> block?\"" + }, + { + "start": 1671.79, + "duration": 0.0, + "text": "parallelize the transformer block?\"" + }, + { + "start": 1671.8, + "duration": 0.0, + "text": "parallelize the transformer block?\" And<00:27:52.920> um<00:27:53.280> this<00:27:53.440> was<00:27:53.560> originally<00:27:54.160> an<00:27:54.320> idea<00:27:54.960> that" + }, + { + "start": 1675.15, + "duration": 0.0, + "text": "And um this was originally an idea that" + }, + { + "start": 1675.16, + "duration": 0.0, + "text": "And um this was originally an idea that was<00:27:55.360> in<00:27:55.560> GPT-J,<00:27:56.480> which<00:27:56.640> is<00:27:56.760> the<00:27:56.840> open-source" + }, + { + "start": 1677.83, + "duration": 
0.0, + "text": "was in GPT-J, which is the open-source" + }, + { + "start": 1677.84, + "duration": 0.0, + "text": "was in GPT-J, which is the open-source attempted<00:27:58.560> replication<00:27:59.200> of<00:27:59.320> GPT-3." + }, + { + "start": 1680.91, + "duration": 0.0, + "text": "attempted replication of GPT-3." + }, + { + "start": 1680.92, + "duration": 0.0, + "text": "attempted replication of GPT-3. Um<00:28:01.800> and<00:28:02.280> kind<00:28:02.480> of<00:28:02.880> very<00:28:03.200> interestingly,<00:28:03.880> I" + }, + { + "start": 1683.91, + "duration": 0.0, + "text": "Um and kind of very interestingly, I" + }, + { + "start": 1683.92, + "duration": 0.0, + "text": "Um and kind of very interestingly, I think<00:28:04.120> GPT-J<00:28:04.840> has<00:28:05.000> been<00:28:05.160> surprisingly" + }, + { + "start": 1686.03, + "duration": 0.0, + "text": "think GPT-J has been surprisingly" + }, + { + "start": 1686.04, + "duration": 0.0, + "text": "think GPT-J has been surprisingly influential<00:28:07.320> in<00:28:07.840> sort<00:28:08.080> of<00:28:08.280> propagating<00:28:09.040> a<00:28:09.080> lot" + }, + { + "start": 1689.31, + "duration": 0.0, + "text": "influential in sort of propagating a lot" + }, + { + "start": 1689.32, + "duration": 0.0, + "text": "influential in sort of propagating a lot of<00:28:09.440> ideas.<00:28:10.040> I<00:28:10.080> mean,<00:28:10.240> PaLM<00:28:10.600> as<00:28:10.800> well.<00:28:11.120> Google" + }, + { + "start": 1692.27, + "duration": 0.0, + "text": "of ideas. I mean, PaLM as well. Google" + }, + { + "start": 1692.28, + "duration": 0.0, + "text": "of ideas. I mean, PaLM as well. 
Google um<00:28:12.560> is<00:28:12.720> actually<00:28:12.920> surprisingly<00:28:13.600> bold<00:28:14.040> with" + }, + { + "start": 1694.19, + "duration": 0.0, + "text": "um is actually surprisingly bold with" + }, + { + "start": 1694.2, + "duration": 0.0, + "text": "um is actually surprisingly bold with the<00:28:14.320> architectures<00:28:14.840> that<00:28:15.000> they<00:28:15.120> do.<00:28:15.880> Um<00:28:16.040> but" + }, + { + "start": 1696.15, + "duration": 0.0, + "text": "the architectures that they do. Um but" + }, + { + "start": 1696.16, + "duration": 0.0, + "text": "the architectures that they do. Um but the<00:28:16.240> description<00:28:17.200> in<00:28:17.800> uh<00:28:17.880> PaLM,<00:28:18.679> uh<00:28:18.720> which<00:28:18.880> you" + }, + { + "start": 1698.95, + "duration": 0.0, + "text": "the description in uh PaLM, uh which you" + }, + { + "start": 1698.96, + "duration": 0.0, + "text": "the description in uh PaLM, uh which you can<00:28:19.080> see<00:28:19.240> in<00:28:19.360> their<00:28:19.520> report,<00:28:20.080> is<00:28:20.280> kind<00:28:20.480> of<00:28:20.560> the" + }, + { + "start": 1700.63, + "duration": 0.0, + "text": "can see in their report, is kind of the" + }, + { + "start": 1700.64, + "duration": 0.0, + "text": "can see in their report, is kind of the following.<00:28:21.000> Instead<00:28:21.320> of<00:28:21.440> nesting<00:28:22.040> this," + }, + { + "start": 1702.31, + "duration": 0.0, + "text": "following. Instead of nesting this," + }, + { + "start": 1702.32, + "duration": 0.0, + "text": "following. 
Instead of nesting this, which<00:28:22.480> is<00:28:22.560> the<00:28:22.679> sequential<00:28:23.200> format<00:28:23.480> at<00:28:23.520> the" + }, + { + "start": 1703.59, + "duration": 0.0, + "text": "which is the sequential format at the" + }, + { + "start": 1703.6, + "duration": 0.0, + "text": "which is the sequential format at the top,<00:28:24.200> you<00:28:24.280> know,<00:28:24.360> you're<00:28:24.480> just<00:28:24.679> going<00:28:24.800> to<00:28:24.960> add" + }, + { + "start": 1705.27, + "duration": 0.0, + "text": "top, you know, you're just going to add" + }, + { + "start": 1705.28, + "duration": 0.0, + "text": "top, you know, you're just going to add together<00:28:26.080> the<00:28:26.240> output<00:28:26.520> of<00:28:26.640> the<00:28:26.720> MLP<00:28:27.200> and" + }, + { + "start": 1707.31, + "duration": 0.0, + "text": "together the output of the MLP and" + }, + { + "start": 1707.32, + "duration": 0.0, + "text": "together the output of the MLP and attention<00:28:27.800> layer,<00:28:28.320> and<00:28:28.400> just<00:28:28.640> add<00:28:28.919> both<00:28:29.159> of" + }, + { + "start": 1709.23, + "duration": 0.0, + "text": "attention layer, and just add both of" + }, + { + "start": 1709.24, + "duration": 0.0, + "text": "attention layer, and just add both of those<00:28:29.480> back<00:28:30.040> into<00:28:30.280> the<00:28:30.360> residual<00:28:30.880> stream." + }, + { + "start": 1712.23, + "duration": 0.0, + "text": "those back into the residual stream." + }, + { + "start": 1712.24, + "duration": 0.0, + "text": "those back into the residual stream. 
Um<00:28:32.840> if<00:28:32.960> you<00:28:33.120> implement<00:28:33.560> this<00:28:33.760> right,<00:28:34.360> you<00:28:34.480> can" + }, + { + "start": 1714.59, + "duration": 0.0, + "text": "Um if you implement this right, you can" + }, + { + "start": 1714.6, + "duration": 0.0, + "text": "Um if you implement this right, you can actually<00:28:34.880> share<00:28:35.240> a<00:28:35.280> lot<00:28:35.480> of<00:28:35.560> the<00:28:35.600> components." + }, + { + "start": 1716.11, + "duration": 0.0, + "text": "actually share a lot of the components." + }, + { + "start": 1716.12, + "duration": 0.0, + "text": "actually share a lot of the components. Like,<00:28:36.240> you<00:28:36.320> can<00:28:36.400> share<00:28:36.679> the<00:28:36.800> layer<00:28:37.080> norms,<00:28:37.520> you" + }, + { + "start": 1717.63, + "duration": 0.0, + "text": "Like, you can share the layer norms, you" + }, + { + "start": 1717.64, + "duration": 0.0, + "text": "Like, you can share the layer norms, you can<00:28:38.159> fuse<00:28:38.520> the<00:28:38.600> matrix<00:28:38.960> multiplies.<00:28:40.000> This" + }, + { + "start": 1720.149, + "duration": 0.0, + "text": "can fuse the matrix multiplies. This" + }, + { + "start": 1720.159, + "duration": 0.0, + "text": "can fuse the matrix multiplies. This allows<00:28:40.560> you<00:28:40.640> to<00:28:40.760> potentially<00:28:41.440> get<00:28:41.640> additional" + }, + { + "start": 1722.19, + "duration": 0.0, + "text": "allows you to potentially get additional" + }, + { + "start": 1722.2, + "duration": 0.0, + "text": "allows you to potentially get additional systems<00:28:42.679> optimizations.<00:28:44.000> Um" + }, + { + "start": 1725.07, + "duration": 0.0, + "text": "systems optimizations. Um" + }, + { + "start": 1725.08, + "duration": 0.0, + "text": "systems optimizations. 
Um And<00:28:45.480> I<00:28:45.520> think<00:28:45.800> a<00:28:45.880> lot<00:28:46.120> of<00:28:46.200> the<00:28:46.280> people<00:28:46.600> that" + }, + { + "start": 1726.75, + "duration": 0.0, + "text": "And I think a lot of the people that" + }, + { + "start": 1726.76, + "duration": 0.0, + "text": "And I think a lot of the people that have<00:28:46.960> been<00:28:47.159> influenced<00:28:47.880> by<00:28:48.080> Google,<00:28:48.640> so" + }, + { + "start": 1728.87, + "duration": 0.0, + "text": "have been influenced by Google, so" + }, + { + "start": 1728.88, + "duration": 0.0, + "text": "have been influenced by Google, so Cohere,<00:28:49.520> you<00:28:49.600> know,<00:28:49.760> was<00:28:50.000> founded<00:28:50.360> from<00:28:50.520> one" + }, + { + "start": 1730.63, + "duration": 0.0, + "text": "Cohere, you know, was founded from one" + }, + { + "start": 1730.64, + "duration": 0.0, + "text": "Cohere, you know, was founded from one of<00:28:50.720> the<00:28:50.800> former" + }, + { + "start": 1731.95, + "duration": 0.0, + "text": "of the former" + }, + { + "start": 1731.96, + "duration": 0.0, + "text": "of the former uh<00:28:52.040> transformer<00:28:52.560> authors,<00:28:53.320> they<00:28:53.520> do<00:28:53.679> a<00:28:53.760> lot<00:28:53.960> of" + }, + { + "start": 1734.07, + "duration": 0.0, + "text": "uh transformer authors, they do a lot of" + }, + { + "start": 1734.08, + "duration": 0.0, + "text": "uh transformer authors, they do a lot of Google-inspired<00:28:54.880> optimizations.<00:28:55.480> They" + }, + { + "start": 1735.63, + "duration": 0.0, + "text": "Google-inspired optimizations. They" + }, + { + "start": 1735.64, + "duration": 0.0, + "text": "Google-inspired optimizations. They followed<00:28:56.400> kind<00:28:56.560> of<00:28:56.640> this<00:28:56.800> architecture.<00:28:57.960> Um" + }, + { + "start": 1738.27, + "duration": 0.0, + "text": "followed kind of this architecture. 
Um" + }, + { + "start": 1738.28, + "duration": 0.0, + "text": "followed kind of this architecture. Um but<00:28:58.400> not<00:28:58.600> very<00:28:58.800> many<00:28:59.040> others.<00:28:59.720> Um<00:28:59.800> this<00:28:59.919> has" + }, + { + "start": 1740.07, + "duration": 0.0, + "text": "but not very many others. Um this has" + }, + { + "start": 1740.08, + "duration": 0.0, + "text": "but not very many others. Um this has been<00:29:00.280> a<00:29:00.679> approach<00:29:01.120> that<00:29:01.240> has<00:29:01.400> really<00:29:01.640> fallen" + }, + { + "start": 1741.95, + "duration": 0.0, + "text": "been a approach that has really fallen" + }, + { + "start": 1741.96, + "duration": 0.0, + "text": "been a approach that has really fallen out<00:29:02.080> of<00:29:02.159> popularity<00:29:02.800> over<00:29:02.919> the<00:29:03.040> past,<00:29:03.360> I" + }, + { + "start": 1743.39, + "duration": 0.0, + "text": "out of popularity over the past, I" + }, + { + "start": 1743.4, + "duration": 0.0, + "text": "out of popularity over the past, I think,<00:29:03.600> 2<00:29:03.800> years.<00:29:04.760> Um<00:29:04.919> I<00:29:04.960> think<00:29:05.200> mainly" + }, + { + "start": 1745.63, + "duration": 0.0, + "text": "think, 2 years. Um I think mainly" + }, + { + "start": 1745.64, + "duration": 0.0, + "text": "think, 2 years. 
Um I think mainly because<00:29:06.000> optimization<00:29:06.640> of<00:29:06.720> the<00:29:06.840> serial<00:29:07.360> form" + }, + { + "start": 1748.03, + "duration": 0.0, + "text": "because optimization of the serial form" + }, + { + "start": 1748.04, + "duration": 0.0, + "text": "because optimization of the serial form has<00:29:08.200> gotten<00:29:08.480> sufficiently<00:29:09.120> good<00:29:09.400> that<00:29:09.520> the" + }, + { + "start": 1749.59, + "duration": 0.0, + "text": "has gotten sufficiently good that the" + }, + { + "start": 1749.6, + "duration": 0.0, + "text": "has gotten sufficiently good that the systems<00:29:10.080> gains<00:29:10.720> from<00:29:10.880> the<00:29:10.960> second<00:29:11.400> one<00:29:11.520> just" + }, + { + "start": 1751.75, + "duration": 0.0, + "text": "systems gains from the second one just" + }, + { + "start": 1751.76, + "duration": 0.0, + "text": "systems gains from the second one just isn't<00:29:12.120> worth<00:29:12.720> the<00:29:13.000> small<00:29:13.440> hits<00:29:14.159> to<00:29:14.640> uh" + }, + { + "start": 1754.71, + "duration": 0.0, + "text": "isn't worth the small hits to uh" + }, + { + "start": 1754.72, + "duration": 0.0, + "text": "isn't worth the small hits to uh representation<00:29:15.440> power<00:29:15.720> that<00:29:15.880> you<00:29:16.000> end<00:29:16.120> up" + }, + { + "start": 1756.23, + "duration": 0.0, + "text": "representation power that you end up" + }, + { + "start": 1756.24, + "duration": 0.0, + "text": "representation power that you end up getting<00:29:16.919> going<00:29:17.240> from<00:29:17.800> uh<00:29:18.159> parallel<00:29:18.480> to" + }, + { + "start": 1758.59, + "duration": 0.0, + "text": "getting going from uh parallel to" + }, + { + "start": 1758.6, + "duration": 0.0, + "text": "getting going from uh parallel to serial." + }, + { + "start": 1759.83, + "duration": 0.0, + "text": "serial." + }, + { + "start": 1759.84, + "duration": 0.0, + "text": "serial. 
Effectively,<00:29:20.440> you<00:29:20.520> can<00:29:20.640> think<00:29:20.800> about<00:29:20.960> it<00:29:21.080> as" + }, + { + "start": 1761.47, + "duration": 0.0, + "text": "Effectively, you can think about it as" + }, + { + "start": 1761.48, + "duration": 0.0, + "text": "Effectively, you can think about it as you've<00:29:21.640> lost<00:29:22.000> half<00:29:22.240> of<00:29:22.360> your<00:29:22.520> depth,<00:29:22.880> right?" + }, + { + "start": 1763.07, + "duration": 0.0, + "text": "you've lost half of your depth, right?" + }, + { + "start": 1763.08, + "duration": 0.0, + "text": "you've lost half of your depth, right? And<00:29:23.159> that<00:29:23.360> can<00:29:23.520> be" + }, + { + "start": 1764.43, + "duration": 0.0, + "text": "And that can be" + }, + { + "start": 1764.44, + "duration": 0.0, + "text": "And that can be uh<00:29:24.720> a<00:29:24.760> deleterious" + }, + { + "start": 1766.35, + "duration": 0.0, + "text": "uh a deleterious" + }, + { + "start": 1766.36, + "duration": 0.0, + "text": "uh a deleterious uh<00:29:26.679> thing<00:29:26.880> to<00:29:27.000> do<00:29:27.200> to<00:29:27.320> your<00:29:27.440> model." + }, + { + "start": 1769.27, + "duration": 0.0, + "text": "uh thing to do to your model." + }, + { + "start": 1769.28, + "duration": 0.0, + "text": "uh thing to do to your model. Okay." + }, + { + "start": 1770.51, + "duration": 0.0, + "text": "Okay." + }, + { + "start": 1770.52, + "duration": 0.0, + "text": "Okay. 
So,<00:29:31.000> in<00:29:31.120> terms<00:29:31.480> of<00:29:31.600> the<00:29:31.760> architecture<00:29:32.320> things," + }, + { + "start": 1772.95, + "duration": 0.0, + "text": "So, in terms of the architecture things," + }, + { + "start": 1772.96, + "duration": 0.0, + "text": "So, in terms of the architecture things, actually,<00:29:33.560> you<00:29:33.640> know,<00:29:33.800> the<00:29:33.919> fact<00:29:34.120> that<00:29:34.240> this" + }, + { + "start": 1774.39, + "duration": 0.0, + "text": "actually, you know, the fact that this" + }, + { + "start": 1774.4, + "duration": 0.0, + "text": "actually, you know, the fact that this is<00:29:34.520> so<00:29:34.679> short<00:29:35.679> should<00:29:35.840> kind<00:29:36.000> of<00:29:36.080> suggest<00:29:36.560> to" + }, + { + "start": 1776.669, + "duration": 0.0, + "text": "is so short should kind of suggest to" + }, + { + "start": 1776.679, + "duration": 0.0, + "text": "is so short should kind of suggest to you<00:29:36.880> how<00:29:37.360> much<00:29:37.800> the<00:29:37.919> original<00:29:38.400> transformer" + }, + { + "start": 1779.35, + "duration": 0.0, + "text": "you how much the original transformer" + }, + { + "start": 1779.36, + "duration": 0.0, + "text": "you how much the original transformer formulation<00:29:40.000> has<00:29:40.480> somewhat<00:29:40.800> stood<00:29:41.080> the<00:29:41.200> test" + }, + { + "start": 1781.47, + "duration": 0.0, + "text": "formulation has somewhat stood the test" + }, + { + "start": 1781.48, + "duration": 0.0, + "text": "formulation has somewhat stood the test of<00:29:41.640> time,<00:29:42.000> right?<00:29:42.159> Cuz<00:29:42.280> the<00:29:42.400> only<00:29:42.600> thing<00:29:42.760> I'm" + }, + { + "start": 1782.87, + "duration": 0.0, + "text": "of time, right? Cuz the only thing I'm" + }, + { + "start": 1782.88, + "duration": 0.0, + "text": "of time, right? 
Cuz the only thing I'm really<00:29:43.120> talking<00:29:43.560> about<00:29:44.159> changing<00:29:44.600> here" + }, + { + "start": 1785.55, + "duration": 0.0, + "text": "really talking about changing here" + }, + { + "start": 1785.56, + "duration": 0.0, + "text": "really talking about changing here um<00:29:45.880> is,<00:29:46.080> you<00:29:46.159> know,<00:29:46.280> where<00:29:46.440> the<00:29:46.560> norms<00:29:47.000> go,<00:29:47.640> or" + }, + { + "start": 1787.99, + "duration": 0.0, + "text": "um is, you know, where the norms go, or" + }, + { + "start": 1788.0, + "duration": 0.0, + "text": "um is, you know, where the norms go, or you<00:29:48.080> know,<00:29:48.200> whether<00:29:48.480> we<00:29:48.760> have<00:29:49.000> bias<00:29:49.400> terms,<00:29:50.120> or" + }, + { + "start": 1790.27, + "duration": 0.0, + "text": "you know, whether we have bias terms, or" + }, + { + "start": 1790.28, + "duration": 0.0, + "text": "you know, whether we have bias terms, or whether<00:29:50.560> we<00:29:50.720> gate<00:29:51.040> the<00:29:51.160> MLPs,<00:29:51.720> but<00:29:51.840> those<00:29:52.000> are" + }, + { + "start": 1792.07, + "duration": 0.0, + "text": "whether we gate the MLPs, but those are" + }, + { + "start": 1792.08, + "duration": 0.0, + "text": "whether we gate the MLPs, but those are actually<00:29:52.320> pretty<00:29:52.760> minor<00:29:53.200> changes<00:29:54.000> compared" + }, + { + "start": 1794.35, + "duration": 0.0, + "text": "actually pretty minor changes compared" + }, + { + "start": 1794.36, + "duration": 0.0, + "text": "actually pretty minor changes compared to<00:29:54.480> all<00:29:54.600> the<00:29:54.680> things<00:29:54.960> that<00:29:55.080> you<00:29:55.200> can<00:29:55.320> do." + }, + { + "start": 1796.43, + "duration": 0.0, + "text": "to all the things that you can do." + }, + { + "start": 1796.44, + "duration": 0.0, + "text": "to all the things that you can do. 
Now,<00:29:57.240> uh,<00:29:57.520> those<00:29:57.760> of<00:29:57.880> you<00:29:57.960> that<00:29:58.120> are,<00:29:58.400> you" + }, + { + "start": 1798.47, + "duration": 0.0, + "text": "Now, uh, those of you that are, you" + }, + { + "start": 1798.48, + "duration": 0.0, + "text": "Now, uh, those of you that are, you know,<00:29:58.600> sort<00:29:58.800> of<00:29:58.880> carefully<00:29:59.240> paying<00:29:59.440> attention" + }, + { + "start": 1799.83, + "duration": 0.0, + "text": "know, sort of carefully paying attention" + }, + { + "start": 1799.84, + "duration": 0.0, + "text": "know, sort of carefully paying attention might<00:30:00.000> say,<00:30:00.200> but<00:30:00.400> wait,<00:30:01.040> you<00:30:01.160> know,<00:30:01.280> there's<00:30:01.480> a" + }, + { + "start": 1801.51, + "duration": 0.0, + "text": "might say, but wait, you know, there's a" + }, + { + "start": 1801.52, + "duration": 0.0, + "text": "might say, but wait, you know, there's a lot<00:30:01.800> of<00:30:01.880> transformer<00:30:02.400> alternatives<00:30:03.040> that" + }, + { + "start": 1803.19, + "duration": 0.0, + "text": "lot of transformer alternatives that" + }, + { + "start": 1803.2, + "duration": 0.0, + "text": "lot of transformer alternatives that change<00:30:03.480> the<00:30:03.560> attention.<00:30:04.520> Um,<00:30:04.840> yes,<00:30:05.240> you'll" + }, + { + "start": 1805.39, + "duration": 0.0, + "text": "change the attention. Um, yes, you'll" + }, + { + "start": 1805.4, + "duration": 0.0, + "text": "change the attention. 
Um, yes, you'll have<00:30:05.560> to<00:30:05.680> wait<00:30:05.880> until<00:30:06.200> next<00:30:06.560> lecture<00:30:07.320> because" + }, + { + "start": 1807.63, + "duration": 0.0, + "text": "have to wait until next lecture because" + }, + { + "start": 1807.64, + "duration": 0.0, + "text": "have to wait until next lecture because today<00:30:07.880> I'm<00:30:07.960> just<00:30:08.240> only<00:30:08.480> going<00:30:08.600> to<00:30:08.680> cover<00:30:09.040> sort" + }, + { + "start": 1809.23, + "duration": 0.0, + "text": "today I'm just only going to cover sort" + }, + { + "start": 1809.24, + "duration": 0.0, + "text": "today I'm just only going to cover sort of<00:30:09.640> core<00:30:10.040> attention<00:30:10.480> based<00:30:10.800> methods.<00:30:11.800> Um,<00:30:12.120> and" + }, + { + "start": 1812.39, + "duration": 0.0, + "text": "of core attention based methods. Um, and" + }, + { + "start": 1812.4, + "duration": 0.0, + "text": "of core attention based methods. Um, and next<00:30:12.640> lecture<00:30:12.960> I'll<00:30:13.120> throw<00:30:13.320> in<00:30:13.400> a<00:30:13.440> little<00:30:13.680> bit" + }, + { + "start": 1813.83, + "duration": 0.0, + "text": "next lecture I'll throw in a little bit" + }, + { + "start": 1813.84, + "duration": 0.0, + "text": "next lecture I'll throw in a little bit of<00:30:14.000> a<00:30:14.160> state<00:30:14.480> space<00:30:14.720> model<00:30:15.040> stuff,<00:30:15.440> but<00:30:16.160> as" + }, + { + "start": 1816.27, + "duration": 0.0, + "text": "of a state space model stuff, but as" + }, + { + "start": 1816.28, + "duration": 0.0, + "text": "of a state space model stuff, but as long<00:30:16.520> as<00:30:16.600> you're<00:30:16.720> in<00:30:16.800> this<00:30:16.960> like<00:30:17.120> dense" + }, + { + "start": 1817.39, + "duration": 0.0, + "text": "long as you're in this like dense" + }, + { + "start": 1817.4, + "duration": 0.0, + "text": "long as you're in this like dense attention<00:30:17.880> 
land,<00:30:18.640> actually<00:30:19.040> the" + }, + { + "start": 1819.15, + "duration": 0.0, + "text": "attention land, actually the" + }, + { + "start": 1819.16, + "duration": 0.0, + "text": "attention land, actually the architecture<00:30:19.640> from<00:30:19.840> the<00:30:19.960> original" + }, + { + "start": 1820.31, + "duration": 0.0, + "text": "architecture from the original" + }, + { + "start": 1820.32, + "duration": 0.0, + "text": "architecture from the original transformer<00:30:20.800> paper<00:30:21.040> is<00:30:21.200> pretty<00:30:21.600> close<00:30:22.360> to" + }, + { + "start": 1822.47, + "duration": 0.0, + "text": "transformer paper is pretty close to" + }, + { + "start": 1822.48, + "duration": 0.0, + "text": "transformer paper is pretty close to what<00:30:22.640> we<00:30:22.760> do." + }, + { + "start": 1823.91, + "duration": 0.0, + "text": "what we do." + }, + { + "start": 1823.92, + "duration": 0.0, + "text": "what we do. So,<00:30:24.080> you<00:30:24.200> see,<00:30:24.680> uh,<00:30:24.800> quite<00:30:25.040> a<00:30:25.080> bit<00:30:25.240> of<00:30:25.320> this," + }, + { + "start": 1825.55, + "duration": 0.0, + "text": "So, you see, uh, quite a bit of this," + }, + { + "start": 1825.56, + "duration": 0.0, + "text": "So, you see, uh, quite a bit of this, right?<00:30:25.800> So,<00:30:26.280> uh,<00:30:26.520> just<00:30:26.680> now<00:30:26.880> going<00:30:27.120> back<00:30:27.320> to" + }, + { + "start": 1827.39, + "duration": 0.0, + "text": "right? So, uh, just now going back to" + }, + { + "start": 1827.4, + "duration": 0.0, + "text": "right? 
So, uh, just now going back to this,<00:30:28.000> blue<00:30:28.280> here<00:30:28.520> is<00:30:28.640> RMS<00:30:29.040> norm<00:30:29.280> block<00:30:29.640> as" + }, + { + "start": 1829.75, + "duration": 0.0, + "text": "this, blue here is RMS norm block as" + }, + { + "start": 1829.76, + "duration": 0.0, + "text": "this, blue here is RMS norm block as layer<00:30:30.000> norm.<00:30:30.560> You<00:30:30.680> see<00:30:30.840> most<00:30:31.200> of<00:30:31.280> the<00:30:31.360> modern" + }, + { + "start": 1831.87, + "duration": 0.0, + "text": "layer norm. You see most of the modern" + }, + { + "start": 1831.88, + "duration": 0.0, + "text": "layer norm. You see most of the modern models<00:30:32.280> are<00:30:32.440> sort<00:30:32.600> of<00:30:32.720> RMS<00:30:33.120> norm<00:30:33.320> models." + }, + { + "start": 1834.07, + "duration": 0.0, + "text": "models are sort of RMS norm models." + }, + { + "start": 1834.08, + "duration": 0.0, + "text": "models are sort of RMS norm models. Serial<00:30:34.560> versus<00:30:34.880> parallel<00:30:35.280> layers,<00:30:35.600> the<00:30:35.680> blue" + }, + { + "start": 1835.83, + "duration": 0.0, + "text": "Serial versus parallel layers, the blue" + }, + { + "start": 1835.84, + "duration": 0.0, + "text": "Serial versus parallel layers, the blue one's<00:30:36.080> parallel,<00:30:36.400> the<00:30:36.520> rest<00:30:36.720> is<00:30:36.840> serial.<00:30:37.240> You" + }, + { + "start": 1837.31, + "duration": 0.0, + "text": "one's parallel, the rest is serial. You" + }, + { + "start": 1837.32, + "duration": 0.0, + "text": "one's parallel, the rest is serial. You see<00:30:37.480> mostly<00:30:38.040> serial<00:30:38.480> layers.<00:30:39.360> Um,<00:30:39.480> pre-norm" + }, + { + "start": 1840.07, + "duration": 0.0, + "text": "see mostly serial layers. Um, pre-norm" + }, + { + "start": 1840.08, + "duration": 0.0, + "text": "see mostly serial layers. 
Um, pre-norm versus<00:30:40.440> post-norm.<00:30:40.960> Some<00:30:41.160> of<00:30:41.240> these,<00:30:41.920> uh," + }, + { + "start": 1841.99, + "duration": 0.0, + "text": "versus post-norm. Some of these, uh," + }, + { + "start": 1842.0, + "duration": 0.0, + "text": "versus post-norm. Some of these, uh, ones<00:30:42.240> that<00:30:42.480> I<00:30:42.560> marked<00:30:42.880> as<00:30:43.040> post-norm<00:30:43.520> are" + }, + { + "start": 1843.63, + "duration": 0.0, + "text": "ones that I marked as post-norm are" + }, + { + "start": 1843.64, + "duration": 0.0, + "text": "ones that I marked as post-norm are actually<00:30:44.000> pre<00:30:44.400> and<00:30:44.600> post-norm." + }, + { + "start": 1845.79, + "duration": 0.0, + "text": "actually pre and post-norm." + }, + { + "start": 1845.8, + "duration": 0.0, + "text": "actually pre and post-norm. Um,<00:30:46.240> and<00:30:46.360> then<00:30:46.960> these<00:30:47.400> ones<00:30:47.640> on<00:30:47.720> the<00:30:47.800> right," + }, + { + "start": 1847.99, + "duration": 0.0, + "text": "Um, and then these ones on the right," + }, + { + "start": 1848.0, + "duration": 0.0, + "text": "Um, and then these ones on the right, these<00:30:48.120> are<00:30:48.240> GLUs,<00:30:49.400> uh,<00:30:49.520> almost<00:30:49.960> always<00:30:50.480> with" + }, + { + "start": 1850.59, + "duration": 0.0, + "text": "these are GLUs, uh, almost always with" + }, + { + "start": 1850.6, + "duration": 0.0, + "text": "these are GLUs, uh, almost always with the<00:30:50.680> exception<00:30:51.120> of<00:30:51.240> things<00:30:51.520> like,<00:30:51.920> uh," + }, + { + "start": 1852.03, + "duration": 0.0, + "text": "the exception of things like, uh," + }, + { + "start": 1852.04, + "duration": 0.0, + "text": "the exception of things like, uh, Falcon,<00:30:52.760> which<00:30:52.920> use<00:30:53.120> a<00:30:53.280> gated<00:30:53.640> linear<00:30:53.920> unit," + }, + { + "start": 1854.07, + "duration": 0.0, + "text": 
"Falcon, which use a gated linear unit," + }, + { + "start": 1854.08, + "duration": 0.0, + "text": "Falcon, which use a gated linear unit, but<00:30:54.240> almost<00:30:54.760> all<00:30:54.920> of<00:30:55.000> these<00:30:55.200> are<00:30:55.320> really,<00:30:55.960> uh," + }, + { + "start": 1856.03, + "duration": 0.0, + "text": "but almost all of these are really, uh," + }, + { + "start": 1856.04, + "duration": 0.0, + "text": "but almost all of these are really, uh, gated<00:30:56.360> linear<00:30:56.640> units<00:30:56.960> for<00:30:57.080> modern<00:30:57.400> models." + }, + { + "start": 1859.39, + "duration": 0.0, + "text": "gated linear units for modern models." + }, + { + "start": 1859.4, + "duration": 0.0, + "text": "gated linear units for modern models. So,<00:30:59.760> you<00:30:59.880> can<00:31:00.000> see<00:31:00.160> the<00:31:00.280> trends<00:31:00.600> quite" + }, + { + "start": 1860.83, + "duration": 0.0, + "text": "So, you can see the trends quite" + }, + { + "start": 1860.84, + "duration": 0.0, + "text": "So, you can see the trends quite visually,<00:31:01.880> um," + }, + { + "start": 1862.43, + "duration": 0.0, + "text": "visually, um," + }, + { + "start": 1862.44, + "duration": 0.0, + "text": "visually, um, from<00:31:02.600> what<00:31:02.720> I'm<00:31:02.800> telling<00:31:03.080> you." + }, + { + "start": 1864.15, + "duration": 0.0, + "text": "from what I'm telling you." + }, + { + "start": 1864.16, + "duration": 0.0, + "text": "from what I'm telling you. Okay.<00:31:04.800> So,<00:31:05.280> really<00:31:05.680> the<00:31:05.840> thing<00:31:06.080> that<00:31:06.360> is<00:31:07.040> very" + }, + { + "start": 1867.47, + "duration": 0.0, + "text": "Okay. So, really the thing that is very" + }, + { + "start": 1867.48, + "duration": 0.0, + "text": "Okay. 
So, really the thing that is very different<00:31:07.960> across<00:31:08.640> implementations,<00:31:09.440> and<00:31:09.560> I" + }, + { + "start": 1869.59, + "duration": 0.0, + "text": "different across implementations, and I" + }, + { + "start": 1869.6, + "duration": 0.0, + "text": "different across implementations, and I think<00:31:09.800> a<00:31:09.840> place<00:31:10.200> where<00:31:10.480> a<00:31:10.560> lot<00:31:10.800> of<00:31:10.840> the" + }, + { + "start": 1870.91, + "duration": 0.0, + "text": "think a place where a lot of the" + }, + { + "start": 1870.92, + "duration": 0.0, + "text": "think a place where a lot of the architecture<00:31:11.440> stuff<00:31:11.720> is<00:31:11.840> still<00:31:12.080> in<00:31:12.200> flux,<00:31:13.120> is" + }, + { + "start": 1873.43, + "duration": 0.0, + "text": "architecture stuff is still in flux, is" + }, + { + "start": 1873.44, + "duration": 0.0, + "text": "architecture stuff is still in flux, is how<00:31:13.640> you<00:31:13.840> do<00:31:14.480> kind<00:31:14.600> of<00:31:14.720> position<00:31:15.160> dependence" + }, + { + "start": 1876.15, + "duration": 0.0, + "text": "how you do kind of position dependence" + }, + { + "start": 1876.16, + "duration": 0.0, + "text": "how you do kind of position dependence and<00:31:16.280> incorporate<00:31:16.920> information<00:31:17.560> from<00:31:17.720> other" + }, + { + "start": 1877.91, + "duration": 0.0, + "text": "and incorporate information from other" + }, + { + "start": 1877.92, + "duration": 0.0, + "text": "and incorporate information from other positions,<00:31:18.480> right?<00:31:18.760> So,<00:31:18.880> the<00:31:19.000> core<00:31:19.400> attention" + }, + { + "start": 1879.91, + "duration": 0.0, + "text": "positions, right? So, the core attention" + }, + { + "start": 1879.92, + "duration": 0.0, + "text": "positions, right? So, the core attention component<00:31:20.440> in<00:31:20.560> some<00:31:20.720> sense." 
+ }, + { + "start": 1881.91, + "duration": 0.0, + "text": "component in some sense." + }, + { + "start": 1881.92, + "duration": 0.0, + "text": "component in some sense. Um,<00:31:22.480> so<00:31:22.640> there<00:31:22.800> are<00:31:22.880> lots<00:31:23.360> of<00:31:23.480> different<00:31:23.840> ways" + }, + { + "start": 1884.63, + "duration": 0.0, + "text": "Um, so there are lots of different ways" + }, + { + "start": 1884.64, + "duration": 0.0, + "text": "Um, so there are lots of different ways that<00:31:24.800> you<00:31:24.920> can<00:31:25.040> encode<00:31:25.520> position<00:31:26.000> into<00:31:26.240> a" + }, + { + "start": 1886.27, + "duration": 0.0, + "text": "that you can encode position into a" + }, + { + "start": 1886.28, + "duration": 0.0, + "text": "that you can encode position into a transformer." + }, + { + "start": 1887.91, + "duration": 0.0, + "text": "transformer." + }, + { + "start": 1887.92, + "duration": 0.0, + "text": "transformer. And<00:31:28.200> just<00:31:28.400> so<00:31:28.520> you<00:31:28.640> know,<00:31:28.800> to<00:31:29.040> to<00:31:29.160> remind<00:31:29.560> you," + }, + { + "start": 1889.67, + "duration": 0.0, + "text": "And just so you know, to to remind you," + }, + { + "start": 1889.68, + "duration": 0.0, + "text": "And just so you know, to to remind you, right?<00:31:30.200> This<00:31:30.320> is<00:31:30.440> very,<00:31:30.680> very<00:31:30.960> important" + }, + { + "start": 1891.43, + "duration": 0.0, + "text": "right? This is very, very important" + }, + { + "start": 1891.44, + "duration": 0.0, + "text": "right? 
This is very, very important because<00:31:31.640> attention<00:31:32.480> is<00:31:32.640> positionally" + }, + { + "start": 1893.19, + "duration": 0.0, + "text": "because attention is positionally" + }, + { + "start": 1893.2, + "duration": 0.0, + "text": "because attention is positionally independent,<00:31:33.760> right?<00:31:33.920> They're<00:31:34.000> just<00:31:34.240> inner" + }, + { + "start": 1894.47, + "duration": 0.0, + "text": "independent, right? They're just inner" + }, + { + "start": 1894.48, + "duration": 0.0, + "text": "independent, right? They're just inner products,<00:31:35.400> so<00:31:35.520> you<00:31:35.560> can<00:31:35.680> just<00:31:35.840> shuffle<00:31:36.200> them" + }, + { + "start": 1896.39, + "duration": 0.0, + "text": "products, so you can just shuffle them" + }, + { + "start": 1896.4, + "duration": 0.0, + "text": "products, so you can just shuffle them and<00:31:36.520> attention<00:31:36.920> would<00:31:37.040> be<00:31:37.160> the<00:31:37.280> same<00:31:37.840> if<00:31:37.960> you" + }, + { + "start": 1898.07, + "duration": 0.0, + "text": "and attention would be the same if you" + }, + { + "start": 1898.08, + "duration": 0.0, + "text": "and attention would be the same if you don't<00:31:38.280> have<00:31:38.400> a<00:31:38.480> position<00:31:38.880> embedding." + }, + { + "start": 1899.99, + "duration": 0.0, + "text": "don't have a position embedding." + }, + { + "start": 1900.0, + "duration": 0.0, + "text": "don't have a position embedding. 
The<00:31:40.080> original<00:31:40.440> transformer<00:31:40.920> had<00:31:41.080> sine<00:31:41.320> and" + }, + { + "start": 1901.43, + "duration": 0.0, + "text": "The original transformer had sine and" + }, + { + "start": 1901.44, + "duration": 0.0, + "text": "The original transformer had sine and cosine<00:31:41.880> embeddings,<00:31:42.520> kind<00:31:42.760> of<00:31:42.840> like<00:31:43.000> a" + }, + { + "start": 1903.07, + "duration": 0.0, + "text": "cosine embeddings, kind of like a" + }, + { + "start": 1903.08, + "duration": 0.0, + "text": "cosine embeddings, kind of like a Fourier<00:31:43.560> transform<00:31:44.080> intuition<00:31:44.600> that<00:31:44.720> if<00:31:44.800> you" + }, + { + "start": 1904.87, + "duration": 0.0, + "text": "Fourier transform intuition that if you" + }, + { + "start": 1904.88, + "duration": 0.0, + "text": "Fourier transform intuition that if you have<00:31:45.000> sines<00:31:45.320> and<00:31:45.440> cosines,<00:31:46.360> then<00:31:46.480> you<00:31:46.560> can" + }, + { + "start": 1906.63, + "duration": 0.0, + "text": "have sines and cosines, then you can" + }, + { + "start": 1906.64, + "duration": 0.0, + "text": "have sines and cosines, then you can kind<00:31:46.760> of<00:31:46.880> recover<00:31:47.400> position<00:31:47.800> from<00:31:47.960> that<00:31:48.160> no" + }, + { + "start": 1908.23, + "duration": 0.0, + "text": "kind of recover position from that no" + }, + { + "start": 1908.24, + "duration": 0.0, + "text": "kind of recover position from that no matter<00:31:48.520> what." + }, + { + "start": 1909.51, + "duration": 0.0, + "text": "matter what." + }, + { + "start": 1909.52, + "duration": 0.0, + "text": "matter what. 
Um,<00:31:50.040> a<00:31:50.120> number<00:31:50.440> of<00:31:50.600> other<00:31:51.000> sort<00:31:51.160> of<00:31:51.320> large" + }, + { + "start": 1911.59, + "duration": 0.0, + "text": "Um, a number of other sort of large" + }, + { + "start": 1911.6, + "duration": 0.0, + "text": "Um, a number of other sort of large models<00:31:51.960> that,<00:31:52.480> you<00:31:52.560> know,<00:31:52.680> followed<00:31:53.320> soon" + }, + { + "start": 1913.63, + "duration": 0.0, + "text": "models that, you know, followed soon" + }, + { + "start": 1913.64, + "duration": 0.0, + "text": "models that, you know, followed soon after<00:31:53.920> that<00:31:54.080> used<00:31:54.360> absolute<00:31:54.920> embeddings," + }, + { + "start": 1915.87, + "duration": 0.0, + "text": "after that used absolute embeddings," + }, + { + "start": 1915.88, + "duration": 0.0, + "text": "after that used absolute embeddings, where<00:31:56.320> each<00:31:56.560> position<00:31:57.160> had<00:31:57.360> its<00:31:57.640> own" + }, + { + "start": 1917.87, + "duration": 0.0, + "text": "where each position had its own" + }, + { + "start": 1917.88, + "duration": 0.0, + "text": "where each position had its own different<00:31:58.240> embedding." + }, + { + "start": 1919.31, + "duration": 0.0, + "text": "different embedding." + }, + { + "start": 1919.32, + "duration": 0.0, + "text": "different embedding. 
Um,<00:32:00.160> and<00:32:00.360> then,<00:32:00.880> uh,<00:32:01.000> several<00:32:01.440> other<00:32:01.840> sort<00:32:02.040> of" + }, + { + "start": 1922.19, + "duration": 0.0, + "text": "Um, and then, uh, several other sort of" + }, + { + "start": 1922.2, + "duration": 0.0, + "text": "Um, and then, uh, several other sort of Google<00:32:02.600> models<00:32:03.080> like<00:32:03.360> to<00:32:03.520> use<00:32:03.840> relative" + }, + { + "start": 1924.31, + "duration": 0.0, + "text": "Google models like to use relative" + }, + { + "start": 1924.32, + "duration": 0.0, + "text": "Google models like to use relative embedding.<00:32:04.880> So,<00:32:04.960> in<00:32:05.040> here<00:32:05.320> you're<00:32:05.440> not" + }, + { + "start": 1925.75, + "duration": 0.0, + "text": "embedding. So, in here you're not" + }, + { + "start": 1925.76, + "duration": 0.0, + "text": "embedding. So, in here you're not adding,<00:32:06.480> um,<00:32:06.760> embeddings<00:32:07.280> into<00:32:07.480> the<00:32:07.720> into<00:32:07.960> the" + }, + { + "start": 1928.07, + "duration": 0.0, + "text": "adding, um, embeddings into the into the" + }, + { + "start": 1928.08, + "duration": 0.0, + "text": "adding, um, embeddings into the into the embedding,<00:32:08.840> uh,<00:32:09.320> like<00:32:09.520> word<00:32:09.720> vector" + }, + { + "start": 1929.99, + "duration": 0.0, + "text": "embedding, uh, like word vector" + }, + { + "start": 1930.0, + "duration": 0.0, + "text": "embedding, uh, like word vector embeddings,<00:32:10.720> but<00:32:10.840> instead<00:32:11.200> you're<00:32:11.440> adding<00:32:12.000> a" + }, + { + "start": 1932.11, + "duration": 0.0, + "text": "embeddings, but instead you're adding a" + }, + { + "start": 1932.12, + "duration": 0.0, + "text": "embeddings, but instead you're adding a vector<00:32:12.480> to<00:32:12.600> the<00:32:12.679> attention<00:32:13.200> computation" + }, + { + "start": 1933.75, + "duration": 0.0, + "text": "vector to the 
attention computation" + }, + { + "start": 1933.76, + "duration": 0.0, + "text": "vector to the attention computation itself,<00:32:14.240> right?<00:32:14.400> So,<00:32:14.480> if<00:32:14.600> you're<00:32:14.760> three" + }, + { + "start": 1935.03, + "duration": 0.0, + "text": "itself, right? So, if you're three" + }, + { + "start": 1935.04, + "duration": 0.0, + "text": "itself, right? So, if you're three positions<00:32:15.520> off,<00:32:16.040> sort<00:32:16.200> of<00:32:16.280> the<00:32:16.360> attention" + }, + { + "start": 1936.79, + "duration": 0.0, + "text": "positions off, sort of the attention" + }, + { + "start": 1936.8, + "duration": 0.0, + "text": "positions off, sort of the attention matrix<00:32:17.640> gets<00:32:17.800> a<00:32:17.840> different<00:32:18.240> offset<00:32:18.760> added<00:32:19.040> to" + }, + { + "start": 1939.15, + "duration": 0.0, + "text": "matrix gets a different offset added to" + }, + { + "start": 1939.16, + "duration": 0.0, + "text": "matrix gets a different offset added to it.<00:32:19.400> And<00:32:19.640> and,<00:32:20.080> you<00:32:20.160> know,<00:32:20.280> models<00:32:20.560> like<00:32:20.720> T5" + }, + { + "start": 1941.19, + "duration": 0.0, + "text": "it. And and, you know, models like T5" + }, + { + "start": 1941.2, + "duration": 0.0, + "text": "it. And and, you know, models like T5 and<00:32:21.320> Chinchilla<00:32:21.720> use<00:32:21.920> kind<00:32:22.080> of<00:32:22.120> this<00:32:22.280> scheme." + }, + { + "start": 1943.59, + "duration": 0.0, + "text": "and Chinchilla use kind of this scheme." + }, + { + "start": 1943.6, + "duration": 0.0, + "text": "and Chinchilla use kind of this scheme. 
Um," + }, + { + "start": 1944.79, + "duration": 0.0, + "text": "Um," + }, + { + "start": 1944.8, + "duration": 0.0, + "text": "Um, the<00:32:25.040> thing<00:32:25.280> that<00:32:25.400> has<00:32:25.600> really<00:32:26.000> become<00:32:26.520> pretty" + }, + { + "start": 1947.03, + "duration": 0.0, + "text": "the thing that has really become pretty" + }, + { + "start": 1947.04, + "duration": 0.0, + "text": "the thing that has really become pretty dominant<00:32:27.720> in<00:32:27.800> terms<00:32:28.080> of<00:32:28.160> position<00:32:28.560> embedding" + }, + { + "start": 1949.35, + "duration": 0.0, + "text": "dominant in terms of position embedding" + }, + { + "start": 1949.36, + "duration": 0.0, + "text": "dominant in terms of position embedding is<00:32:29.720> this<00:32:30.360> class<00:32:30.679> of<00:32:30.760> embeddings<00:32:31.200> called<00:32:31.520> rope," + }, + { + "start": 1952.11, + "duration": 0.0, + "text": "is this class of embeddings called rope," + }, + { + "start": 1952.12, + "duration": 0.0, + "text": "is this class of embeddings called rope, which<00:32:32.360> some<00:32:32.520> of<00:32:32.640> you<00:32:32.760> may<00:32:32.880> be<00:32:33.000> familiar<00:32:33.440> with." + }, + { + "start": 1954.03, + "duration": 0.0, + "text": "which some of you may be familiar with." + }, + { + "start": 1954.04, + "duration": 0.0, + "text": "which some of you may be familiar with. Um,<00:32:34.440> most<00:32:34.760> models<00:32:35.120> past<00:32:35.440> 2024<00:32:36.160> use<00:32:36.440> this<00:32:36.640> type" + }, + { + "start": 1956.87, + "duration": 0.0, + "text": "Um, most models past 2024 use this type" + }, + { + "start": 1956.88, + "duration": 0.0, + "text": "Um, most models past 2024 use this type of<00:32:37.000> embedding.<00:32:37.440> And<00:32:37.560> it's<00:32:37.640> kind<00:32:37.840> of" + }, + { + "start": 1957.87, + "duration": 0.0, + "text": "of embedding. 
And it's kind of" + }, + { + "start": 1957.88, + "duration": 0.0, + "text": "of embedding. And it's kind of remarkable<00:32:38.480> given<00:32:38.720> that<00:32:39.000> rope,<00:32:39.840> you<00:32:39.960> know,<00:32:40.160> in" + }, + { + "start": 1960.27, + "duration": 0.0, + "text": "remarkable given that rope, you know, in" + }, + { + "start": 1960.28, + "duration": 0.0, + "text": "remarkable given that rope, you know, in some<00:32:40.520> ways<00:32:40.720> came<00:32:40.960> out<00:32:41.080> of<00:32:41.200> nowhere." + }, + { + "start": 1962.31, + "duration": 0.0, + "text": "some ways came out of nowhere." + }, + { + "start": 1962.32, + "duration": 0.0, + "text": "some ways came out of nowhere. Um,<00:32:42.679> originally<00:32:43.240> I<00:32:43.320> think<00:32:43.560> this<00:32:43.720> was<00:32:43.840> also<00:32:44.080> a" + }, + { + "start": 1964.15, + "duration": 0.0, + "text": "Um, originally I think this was also a" + }, + { + "start": 1964.16, + "duration": 0.0, + "text": "Um, originally I think this was also a GPT-J<00:32:45.000> innovation,<00:32:46.040> um," + }, + { + "start": 1966.669, + "duration": 0.0, + "text": "GPT-J innovation, um," + }, + { + "start": 1966.679, + "duration": 0.0, + "text": "GPT-J innovation, um, from<00:32:47.400> I<00:32:47.480> think,<00:32:47.720> uh,<00:32:48.160> sort<00:32:48.360> of<00:32:48.520> not<00:32:48.800> very<00:32:49.040> well" + }, + { + "start": 1969.27, + "duration": 0.0, + "text": "from I think, uh, sort of not very well" + }, + { + "start": 1969.28, + "duration": 0.0, + "text": "from I think, uh, sort of not very well known<00:32:49.840> sort<00:32:49.960> of<00:32:50.080> blog<00:32:50.520> post<00:32:51.120> and,<00:32:51.560> uh,<00:32:51.640> paper" + }, + { + "start": 1971.95, + "duration": 0.0, + "text": "known sort of blog post and, uh, paper" + }, + { + "start": 1971.96, + "duration": 0.0, + "text": "known sort of blog post and, uh, paper combination,<00:32:53.000> 
uh,<00:32:53.080> from<00:32:53.280> an<00:32:53.400> author<00:32:53.640> in" + }, + { + "start": 1973.71, + "duration": 0.0, + "text": "combination, uh, from an author in" + }, + { + "start": 1973.72, + "duration": 0.0, + "text": "combination, uh, from an author in China.<00:32:54.679> Um,<00:32:55.480> but<00:32:55.679> really<00:32:56.240> it<00:32:56.360> has<00:32:56.520> some<00:32:56.679> really" + }, + { + "start": 1976.87, + "duration": 0.0, + "text": "China. Um, but really it has some really" + }, + { + "start": 1976.88, + "duration": 0.0, + "text": "China. Um, but really it has some really interesting<00:32:57.360> ideas<00:32:58.160> for<00:32:58.520> for<00:32:58.679> why<00:32:59.040> you<00:32:59.160> would" + }, + { + "start": 1979.31, + "duration": 0.0, + "text": "interesting ideas for for why you would" + }, + { + "start": 1979.32, + "duration": 0.0, + "text": "interesting ideas for for why you would do<00:32:59.440> something<00:32:59.760> like<00:33:00.000> rope." + }, + { + "start": 1981.27, + "duration": 0.0, + "text": "do something like rope." + }, + { + "start": 1981.28, + "duration": 0.0, + "text": "do something like rope. So,<00:33:01.440> rope,<00:33:02.160> you<00:33:02.240> know,<00:33:02.360> is<00:33:02.480> a<00:33:02.560> relative" + }, + { + "start": 1983.15, + "duration": 0.0, + "text": "So, rope, you know, is a relative" + }, + { + "start": 1983.16, + "duration": 0.0, + "text": "So, rope, you know, is a relative position<00:33:03.560> embedding.<00:33:04.520> And<00:33:04.800> a<00:33:04.880> relative" + }, + { + "start": 1985.27, + "duration": 0.0, + "text": "position embedding. And a relative" + }, + { + "start": 1985.28, + "duration": 0.0, + "text": "position embedding. 
And a relative position<00:33:05.679> embedding,<00:33:06.080> let's<00:33:06.840> make<00:33:07.000> an" + }, + { + "start": 1987.07, + "duration": 0.0, + "text": "position embedding, let's make an" + }, + { + "start": 1987.08, + "duration": 0.0, + "text": "position embedding, let's make an opinionated<00:33:07.800> stance<00:33:08.760> that<00:33:09.400> I<00:33:09.520> should<00:33:09.840> not" + }, + { + "start": 1990.11, + "duration": 0.0, + "text": "opinionated stance that I should not" + }, + { + "start": 1990.12, + "duration": 0.0, + "text": "opinionated stance that I should not care<00:33:10.760> about<00:33:11.080> the<00:33:11.280> absolute<00:33:11.800> position<00:33:12.440> of<00:33:12.640> any" + }, + { + "start": 1992.87, + "duration": 0.0, + "text": "care about the absolute position of any" + }, + { + "start": 1992.88, + "duration": 0.0, + "text": "care about the absolute position of any words.<00:33:13.480> So,<00:33:13.640> if,<00:33:13.920> you<00:33:13.960> know,<00:33:14.240> A<00:33:15.040> uh,<00:33:15.320> an<00:33:15.560> apple" + }, + { + "start": 1995.91, + "duration": 0.0, + "text": "words. So, if, you know, A uh, an apple" + }, + { + "start": 1995.92, + "duration": 0.0, + "text": "words. So, if, you know, A uh, an apple appear<00:33:16.200> together,<00:33:17.080> even<00:33:17.320> if<00:33:17.400> it<00:33:17.480> appears<00:33:17.840> at" + }, + { + "start": 1997.91, + "duration": 0.0, + "text": "appear together, even if it appears at" + }, + { + "start": 1997.92, + "duration": 0.0, + "text": "appear together, even if it appears at the<00:33:18.040> start<00:33:18.600> or<00:33:18.720> at<00:33:18.800> the<00:33:19.040> end,<00:33:19.679> right?<00:33:19.880> In<00:33:20.040> rope" + }, + { + "start": 2000.27, + "duration": 0.0, + "text": "the start or at the end, right? In rope" + }, + { + "start": 2000.28, + "duration": 0.0, + "text": "the start or at the end, right? 
In rope embeddings,<00:33:21.080> they<00:33:21.200> should<00:33:21.360> kind<00:33:21.520> of<00:33:21.600> get<00:33:21.720> the" + }, + { + "start": 2001.83, + "duration": 0.0, + "text": "embeddings, they should kind of get the" + }, + { + "start": 2001.84, + "duration": 0.0, + "text": "embeddings, they should kind of get the same,<00:33:22.679> uh,<00:33:22.800> sort<00:33:23.000> of<00:33:23.080> result.<00:33:24.200> Um,<00:33:24.960> and<00:33:25.200> we<00:33:25.360> do" + }, + { + "start": 2005.59, + "duration": 0.0, + "text": "same, uh, sort of result. Um, and we do" + }, + { + "start": 2005.6, + "duration": 0.0, + "text": "same, uh, sort of result. Um, and we do know<00:33:26.520> that,<00:33:27.240> you<00:33:27.320> know,<00:33:27.840> or<00:33:28.400> and<00:33:28.520> we<00:33:28.640> want<00:33:28.840> to" + }, + { + "start": 2008.87, + "duration": 0.0, + "text": "know that, you know, or and we want to" + }, + { + "start": 2008.88, + "duration": 0.0, + "text": "know that, you know, or and we want to sort<00:33:29.040> of<00:33:29.480> represent<00:33:29.920> it<00:33:29.960> in<00:33:30.080> this<00:33:30.320> way,<00:33:30.480> right?" + }, + { + "start": 2010.669, + "duration": 0.0, + "text": "sort of represent it in this way, right?" + }, + { + "start": 2010.679, + "duration": 0.0, + "text": "sort of represent it in this way, right? 
So,<00:33:31.240> I<00:33:31.320> have<00:33:31.520> an<00:33:31.600> embedding<00:33:32.080> F,<00:33:32.720> and<00:33:32.840> I<00:33:32.920> have" + }, + { + "start": 2013.23, + "duration": 0.0, + "text": "So, I have an embedding F, and I have" + }, + { + "start": 2013.24, + "duration": 0.0, + "text": "So, I have an embedding F, and I have another<00:33:33.520> embedding<00:33:33.920> F,<00:33:34.320> and<00:33:34.480> these<00:33:34.679> are<00:33:34.880> going" + }, + { + "start": 2015.03, + "duration": 0.0, + "text": "another embedding F, and these are going" + }, + { + "start": 2015.04, + "duration": 0.0, + "text": "another embedding F, and these are going to<00:33:35.120> take<00:33:35.360> in<00:33:35.440> the<00:33:35.560> identity<00:33:36.040> of<00:33:36.160> the<00:33:36.240> words<00:33:36.560> X" + }, + { + "start": 2016.75, + "duration": 0.0, + "text": "to take in the identity of the words X" + }, + { + "start": 2016.76, + "duration": 0.0, + "text": "to take in the identity of the words X and<00:33:36.920> Y<00:33:37.240> and<00:33:37.400> the<00:33:37.560> positions<00:33:38.240> absolute<00:33:38.800> of<00:33:39.000> I" + }, + { + "start": 2019.19, + "duration": 0.0, + "text": "and Y and the positions absolute of I" + }, + { + "start": 2019.2, + "duration": 0.0, + "text": "and Y and the positions absolute of I and<00:33:39.400> J." + }, + { + "start": 2020.31, + "duration": 0.0, + "text": "and J." + }, + { + "start": 2020.32, + "duration": 0.0, + "text": "and J. 
And<00:33:40.440> I<00:33:40.520> want<00:33:40.800> this<00:33:40.960> to<00:33:41.040> be<00:33:41.240> equal<00:33:41.720> if<00:33:41.880> I<00:33:41.920> take" + }, + { + "start": 2022.11, + "duration": 0.0, + "text": "And I want this to be equal if I take" + }, + { + "start": 2022.12, + "duration": 0.0, + "text": "And I want this to be equal if I take the<00:33:42.280> inner<00:33:42.480> product<00:33:43.000> of<00:33:43.080> these<00:33:43.280> embeddings<00:33:43.720> to" + }, + { + "start": 2023.79, + "duration": 0.0, + "text": "the inner product of these embeddings to" + }, + { + "start": 2023.8, + "duration": 0.0, + "text": "the inner product of these embeddings to be<00:33:44.000> equal<00:33:44.880> to<00:33:45.560> a<00:33:45.840> function<00:33:46.480> that<00:33:46.760> only<00:33:47.120> depends" + }, + { + "start": 2027.51, + "duration": 0.0, + "text": "be equal to a function that only depends" + }, + { + "start": 2027.52, + "duration": 0.0, + "text": "be equal to a function that only depends on<00:33:47.640> the<00:33:47.720> relative<00:33:48.120> difference." + }, + { + "start": 2029.71, + "duration": 0.0, + "text": "on the relative difference." + }, + { + "start": 2029.72, + "duration": 0.0, + "text": "on the relative difference. Right?<00:33:50.360> Um,<00:33:50.520> and<00:33:50.600> every<00:33:51.000> existing<00:33:51.560> embedding" + }, + { + "start": 2031.87, + "duration": 0.0, + "text": "Right? Um, and every existing embedding" + }, + { + "start": 2031.88, + "duration": 0.0, + "text": "Right? 
Um, and every existing embedding before<00:33:52.280> it<00:33:52.400> didn't<00:33:52.640> really<00:33:52.840> fulfill<00:33:53.280> this" + }, + { + "start": 2033.43, + "duration": 0.0, + "text": "before it didn't really fulfill this" + }, + { + "start": 2033.44, + "duration": 0.0, + "text": "before it didn't really fulfill this equality.<00:33:54.000> Like<00:33:54.240> sine<00:33:55.120> is<00:33:55.280> not<00:33:55.520> relative" + }, + { + "start": 2035.95, + "duration": 0.0, + "text": "equality. Like sine is not relative" + }, + { + "start": 2035.96, + "duration": 0.0, + "text": "equality. Like sine is not relative because<00:33:56.160> it<00:33:56.240> has<00:33:56.440> these<00:33:56.679> absolute<00:33:57.280> cross" + }, + { + "start": 2037.669, + "duration": 0.0, + "text": "because it has these absolute cross" + }, + { + "start": 2037.679, + "duration": 0.0, + "text": "because it has these absolute cross terms<00:33:58.000> that<00:33:58.120> are<00:33:58.200> not<00:33:58.520> relative.<00:33:59.280> Absolute" + }, + { + "start": 2039.75, + "duration": 0.0, + "text": "terms that are not relative. Absolute" + }, + { + "start": 2039.76, + "duration": 0.0, + "text": "terms that are not relative. Absolute position<00:34:00.120> embeddings,<00:34:00.600> just<00:34:00.800> by<00:34:00.920> the<00:34:01.120> the" + }, + { + "start": 2041.31, + "duration": 0.0, + "text": "position embeddings, just by the the" + }, + { + "start": 2041.32, + "duration": 0.0, + "text": "position embeddings, just by the the name<00:34:01.600> of<00:34:01.720> it,<00:34:01.880> is<00:34:02.000> obviously<00:34:02.520> not<00:34:02.800> relative." + }, + { + "start": 2043.83, + "duration": 0.0, + "text": "name of it, is obviously not relative." + }, + { + "start": 2043.84, + "duration": 0.0, + "text": "name of it, is obviously not relative. 
And<00:34:03.960> then<00:34:04.400> relative<00:34:04.880> embeddings," + }, + { + "start": 2045.79, + "duration": 0.0, + "text": "And then relative embeddings," + }, + { + "start": 2045.8, + "duration": 0.0, + "text": "And then relative embeddings, technically<00:34:06.360> these<00:34:06.560> are<00:34:06.720> relative,<00:34:07.240> but" + }, + { + "start": 2047.35, + "duration": 0.0, + "text": "technically these are relative, but" + }, + { + "start": 2047.36, + "duration": 0.0, + "text": "technically these are relative, but they're<00:34:07.480> not<00:34:07.760> kind<00:34:08.040> of<00:34:08.200> embeddings<00:34:08.879> because" + }, + { + "start": 2049.19, + "duration": 0.0, + "text": "they're not kind of embeddings because" + }, + { + "start": 2049.2, + "duration": 0.0, + "text": "they're not kind of embeddings because they're<00:34:09.320> just<00:34:09.600> adding<00:34:09.960> to<00:34:10.080> the<00:34:10.200> attention" + }, + { + "start": 2050.59, + "duration": 0.0, + "text": "they're just adding to the attention" + }, + { + "start": 2050.6, + "duration": 0.0, + "text": "they're just adding to the attention matrix,<00:34:11.080> right?<00:34:11.280> So,<00:34:11.359> there's<00:34:11.520> no<00:34:11.679> inner" + }, + { + "start": 2051.909, + "duration": 0.0, + "text": "matrix, right? So, there's no inner" + }, + { + "start": 2051.919, + "duration": 0.0, + "text": "matrix, right? 
So, there's no inner product<00:34:12.359> structure<00:34:13.240> that,<00:34:13.640> you<00:34:13.720> know,<00:34:13.840> you" + }, + { + "start": 2053.909, + "duration": 0.0, + "text": "product structure that, you know, you" + }, + { + "start": 2053.919, + "duration": 0.0, + "text": "product structure that, you know, you can<00:34:14.040> extract<00:34:14.600> out<00:34:14.720> of<00:34:14.800> the" + }, + { + "start": 2056.03, + "duration": 0.0, + "text": "can extract out of the" + }, + { + "start": 2056.04, + "duration": 0.0, + "text": "can extract out of the So,<00:34:16.359> given<00:34:16.879> this,<00:34:17.480> you<00:34:17.560> might<00:34:17.800> ask,<00:34:18.359> is<00:34:18.600> there" + }, + { + "start": 2058.869, + "duration": 0.0, + "text": "So, given this, you might ask, is there" + }, + { + "start": 2058.879, + "duration": 0.0, + "text": "So, given this, you might ask, is there a<00:34:18.960> nice<00:34:19.399> way<00:34:19.919> that<00:34:20.159> we<00:34:20.280> can<00:34:20.640> truly<00:34:21.159> have<00:34:21.520> this" + }, + { + "start": 2061.75, + "duration": 0.0, + "text": "a nice way that we can truly have this" + }, + { + "start": 2061.76, + "duration": 0.0, + "text": "a nice way that we can truly have this relative<00:34:22.159> embedding?" + }, + { + "start": 2063.669, + "duration": 0.0, + "text": "relative embedding?" + }, + { + "start": 2063.679, + "duration": 0.0, + "text": "relative embedding? And<00:34:23.800> the<00:34:23.919> idea<00:34:24.200> is<00:34:24.320> very<00:34:24.600> cool.<00:34:25.280> Um,<00:34:25.399> it's" + }, + { + "start": 2065.55, + "duration": 0.0, + "text": "And the idea is very cool. Um, it's" + }, + { + "start": 2065.56, + "duration": 0.0, + "text": "And the idea is very cool. 
Um, it's really<00:34:25.800> just<00:34:26.040> looking<00:34:26.320> at<00:34:26.399> kind<00:34:26.560> of<00:34:27.240> uh," + }, + { + "start": 2067.95, + "duration": 0.0, + "text": "really just looking at kind of uh," + }, + { + "start": 2067.96, + "duration": 0.0, + "text": "really just looking at kind of uh, properties<00:34:28.480> about<00:34:29.159> angles<00:34:29.600> and<00:34:29.720> cosines.<00:34:30.840> So," + }, + { + "start": 2071.59, + "duration": 0.0, + "text": "properties about angles and cosines. So," + }, + { + "start": 2071.6, + "duration": 0.0, + "text": "properties about angles and cosines. So, we<00:34:31.760> want<00:34:31.960> our<00:34:32.080> embeddings<00:34:32.560> to<00:34:32.640> be<00:34:32.760> invariant" + }, + { + "start": 2073.389, + "duration": 0.0, + "text": "we want our embeddings to be invariant" + }, + { + "start": 2073.399, + "duration": 0.0, + "text": "we want our embeddings to be invariant to<00:34:33.480> absolute<00:34:33.879> positions,<00:34:34.560> and<00:34:34.679> we<00:34:34.800> know<00:34:34.960> that" + }, + { + "start": 2075.11, + "duration": 0.0, + "text": "to absolute positions, and we know that" + }, + { + "start": 2075.12, + "duration": 0.0, + "text": "to absolute positions, and we know that inner<00:34:35.320> products<00:34:36.240> of<00:34:36.440> any<00:34:36.640> kind<00:34:36.960> are<00:34:37.040> invariant" + }, + { + "start": 2077.71, + "duration": 0.0, + "text": "inner products of any kind are invariant" + }, + { + "start": 2077.72, + "duration": 0.0, + "text": "inner products of any kind are invariant to<00:34:37.800> arbitrary<00:34:38.200> rotation,<00:34:38.879> right?<00:34:39.600> So,<00:34:40.159> the" + }, + { + "start": 2080.31, + "duration": 0.0, + "text": "to arbitrary rotation, right? So, the" + }, + { + "start": 2080.32, + "duration": 0.0, + "text": "to arbitrary rotation, right? 
So, the idea<00:34:40.720> is<00:34:40.840> to<00:34:40.960> say,<00:34:41.600> I'm<00:34:41.720> going<00:34:41.879> to<00:34:42.000> take<00:34:42.399> my" + }, + { + "start": 2082.63, + "duration": 0.0, + "text": "idea is to say, I'm going to take my" + }, + { + "start": 2082.64, + "duration": 0.0, + "text": "idea is to say, I'm going to take my semantic<00:34:43.760> word<00:34:44.000> vectors,<00:34:44.399> the<00:34:44.480> ones<00:34:44.720> that<00:34:44.879> are" + }, + { + "start": 2085.03, + "duration": 0.0, + "text": "semantic word vectors, the ones that are" + }, + { + "start": 2085.04, + "duration": 0.0, + "text": "semantic word vectors, the ones that are are<00:34:45.240> independent<00:34:46.000> of<00:34:46.159> any<00:34:46.359> position.<00:34:46.879> So," + }, + { + "start": 2086.99, + "duration": 0.0, + "text": "are independent of any position. So," + }, + { + "start": 2087.0, + "duration": 0.0, + "text": "are independent of any position. So, this<00:34:47.200> is<00:34:47.240> my<00:34:47.359> starting<00:34:47.720> point.<00:34:48.600> And<00:34:48.800> then<00:34:49.200> I'm" + }, + { + "start": 2089.27, + "duration": 0.0, + "text": "this is my starting point. And then I'm" + }, + { + "start": 2089.28, + "duration": 0.0, + "text": "this is my starting point. 
And then I'm going<00:34:49.399> to<00:34:49.600> rotate<00:34:50.240> each<00:34:50.399> of<00:34:50.480> these<00:34:50.640> vectors," + }, + { + "start": 2091.23, + "duration": 0.0, + "text": "going to rotate each of these vectors," + }, + { + "start": 2091.24, + "duration": 0.0, + "text": "going to rotate each of these vectors, in<00:34:51.359> this<00:34:51.480> case<00:34:51.720> in<00:34:51.879> 2D,<00:34:52.919> um,<00:34:53.760> based<00:34:54.200> on<00:34:54.560> the" + }, + { + "start": 2094.669, + "duration": 0.0, + "text": "in this case in 2D, um, based on the" + }, + { + "start": 2094.679, + "duration": 0.0, + "text": "in this case in 2D, um, based on the position<00:34:55.159> that<00:34:55.320> the<00:34:55.440> words<00:34:55.679> appear.<00:34:56.520> So,<00:34:57.400> you" + }, + { + "start": 2097.47, + "duration": 0.0, + "text": "position that the words appear. So, you" + }, + { + "start": 2097.48, + "duration": 0.0, + "text": "position that the words appear. So, you know,<00:34:57.600> just<00:34:57.880> as<00:34:58.040> a" + }, + { + "start": 2099.15, + "duration": 0.0, + "text": "know, just as a" + }, + { + "start": 2099.16, + "duration": 0.0, + "text": "know, just as a uh,<00:34:59.440> simple<00:34:59.760> example,<00:35:00.680> we,<00:35:01.120> let's<00:35:01.320> say<00:35:01.480> we" + }, + { + "start": 2101.63, + "duration": 0.0, + "text": "uh, simple example, we, let's say we" + }, + { + "start": 2101.64, + "duration": 0.0, + "text": "uh, simple example, we, let's say we have<00:35:01.800> the<00:35:02.120> uh,<00:35:02.480> sentence,<00:35:03.000> we<00:35:03.240> know<00:35:03.480> that," + }, + { + "start": 2104.11, + "duration": 0.0, + "text": "have the uh, sentence, we know that," + }, + { + "start": 2104.12, + "duration": 0.0, + "text": "have the uh, sentence, we know that, right?<00:35:04.640> We<00:35:04.880> appear<00:35:05.200> at<00:35:05.320> position<00:35:05.760> zero,<00:35:06.600> so" + }, + { + "start": 2106.71, + 
"duration": 0.0, + "text": "right? We appear at position zero, so" + }, + { + "start": 2106.72, + "duration": 0.0, + "text": "right? We appear at position zero, so I'm<00:35:06.800> not<00:35:07.000> going<00:35:07.120> to<00:35:07.160> touch<00:35:07.359> that<00:35:07.520> at<00:35:07.680> all," + }, + { + "start": 2107.91, + "duration": 0.0, + "text": "I'm not going to touch that at all," + }, + { + "start": 2107.92, + "duration": 0.0, + "text": "I'm not going to touch that at all, right?<00:35:08.040> I'm<00:35:08.120> just<00:35:08.280> going<00:35:08.400> to<00:35:08.440> keep<00:35:08.680> that<00:35:08.880> where" + }, + { + "start": 2108.99, + "duration": 0.0, + "text": "right? I'm just going to keep that where" + }, + { + "start": 2109.0, + "duration": 0.0, + "text": "right? I'm just going to keep that where it<00:35:09.120> is." + }, + { + "start": 2110.03, + "duration": 0.0, + "text": "it is." + }, + { + "start": 2110.04, + "duration": 0.0, + "text": "it is. The<00:35:10.200> word<00:35:10.480> know<00:35:11.400> is<00:35:11.560> at<00:35:11.720> position<00:35:12.200> one,<00:35:12.560> so<00:35:12.640> I'm" + }, + { + "start": 2112.71, + "duration": 0.0, + "text": "The word know is at position one, so I'm" + }, + { + "start": 2112.72, + "duration": 0.0, + "text": "The word know is at position one, so I'm going<00:35:12.840> to<00:35:12.920> rotate<00:35:13.440> it<00:35:13.560> by<00:35:13.680> some<00:35:13.960> angle,<00:35:14.480> right?" + }, + { + "start": 2114.95, + "duration": 0.0, + "text": "going to rotate it by some angle, right?" + }, + { + "start": 2114.96, + "duration": 0.0, + "text": "going to rotate it by some angle, right? And<00:35:15.080> that's<00:35:15.280> my<00:35:15.800> my<00:35:15.960> one<00:35:16.200> position<00:35:16.600> rotation." + }, + { + "start": 2117.95, + "duration": 0.0, + "text": "And that's my my one position rotation." 
+ }, + { + "start": 2117.96, + "duration": 0.0, + "text": "And that's my my one position rotation. Now,<00:35:18.160> what<00:35:18.400> happens<00:35:18.920> if<00:35:19.040> I<00:35:19.120> apply<00:35:19.400> the<00:35:19.520> same" + }, + { + "start": 2119.79, + "duration": 0.0, + "text": "Now, what happens if I apply the same" + }, + { + "start": 2119.8, + "duration": 0.0, + "text": "Now, what happens if I apply the same idea<00:35:20.920> to,<00:35:21.640> uh,<00:35:21.760> the<00:35:21.880> following<00:35:22.760> sequence,<00:35:23.280> of" + }, + { + "start": 2123.51, + "duration": 0.0, + "text": "idea to, uh, the following sequence, of" + }, + { + "start": 2123.52, + "duration": 0.0, + "text": "idea to, uh, the following sequence, of course<00:35:23.920> we<00:35:24.040> know,<00:35:24.400> right?<00:35:24.560> In<00:35:24.640> this<00:35:24.840> case,<00:35:25.440> we" + }, + { + "start": 2125.75, + "duration": 0.0, + "text": "course we know, right? In this case, we" + }, + { + "start": 2125.76, + "duration": 0.0, + "text": "course we know, right? In this case, we and<00:35:25.880> know<00:35:26.120> are<00:35:26.240> still<00:35:26.440> adjacent,<00:35:26.880> they're" + }, + { + "start": 2127.03, + "duration": 0.0, + "text": "and know are still adjacent, they're" + }, + { + "start": 2127.04, + "duration": 0.0, + "text": "and know are still adjacent, they're right<00:35:27.280> next<00:35:27.520> to<00:35:27.600> each<00:35:27.720> other,<00:35:28.320> but<00:35:28.440> their" + }, + { + "start": 2128.63, + "duration": 0.0, + "text": "right next to each other, but their" + }, + { + "start": 2128.64, + "duration": 0.0, + "text": "right next to each other, but their absolute<00:35:29.160> position<00:35:29.520> is<00:35:29.600> shifted,<00:35:30.080> right?<00:35:30.320> Of" + }, + { + "start": 2130.51, + "duration": 0.0, + "text": "absolute position is shifted, right? 
Of" + }, + { + "start": 2130.52, + "duration": 0.0, + "text": "absolute position is shifted, right? Of course,<00:35:31.120> you<00:35:31.200> know,<00:35:31.280> comes<00:35:31.520> before<00:35:31.760> we<00:35:31.960> know" + }, + { + "start": 2132.15, + "duration": 0.0, + "text": "course, you know, comes before we know" + }, + { + "start": 2132.16, + "duration": 0.0, + "text": "course, you know, comes before we know now.<00:35:32.920> In<00:35:33.080> this<00:35:33.280> case,<00:35:33.880> I'm<00:35:33.960> going<00:35:34.080> to<00:35:34.160> rotate" + }, + { + "start": 2134.55, + "duration": 0.0, + "text": "now. In this case, I'm going to rotate" + }, + { + "start": 2134.56, + "duration": 0.0, + "text": "now. In this case, I'm going to rotate the<00:35:34.680> word<00:35:35.080> we<00:35:35.400> by<00:35:35.560> two<00:35:35.800> positions<00:35:36.320> because" + }, + { + "start": 2136.51, + "duration": 0.0, + "text": "the word we by two positions because" + }, + { + "start": 2136.52, + "duration": 0.0, + "text": "the word we by two positions because it's<00:35:36.760> two<00:35:36.960> index,<00:35:37.359> right?<00:35:38.000> 0<00:35:38.440> 1<00:35:38.760> 2.<00:35:39.440> So,<00:35:39.560> the" + }, + { + "start": 2139.67, + "duration": 0.0, + "text": "it's two index, right? 0 1 2. So, the" + }, + { + "start": 2139.68, + "duration": 0.0, + "text": "it's two index, right? 0 1 2. So, the word<00:35:40.000> we<00:35:40.160> is<00:35:40.280> in<00:35:40.400> the<00:35:40.640> second,<00:35:41.359> uh,<00:35:41.400> position" + }, + { + "start": 2141.75, + "duration": 0.0, + "text": "word we is in the second, uh, position" + }, + { + "start": 2141.76, + "duration": 0.0, + "text": "word we is in the second, uh, position number<00:35:42.000> two,<00:35:42.359> so<00:35:42.480> I<00:35:42.560> rotate<00:35:43.000> by<00:35:43.120> two.<00:35:43.920> I<00:35:44.040> rotate" + }, + { + "start": 2144.51, + "duration": 0.0, + "text": "number two, so I rotate by two. 
I rotate" + }, + { + "start": 2144.52, + "duration": 0.0, + "text": "number two, so I rotate by two. I rotate know<00:35:44.880> by<00:35:45.040> three<00:35:45.359> positions<00:35:45.840> cuz<00:35:45.960> it's<00:35:46.080> in" + }, + { + "start": 2146.15, + "duration": 0.0, + "text": "know by three positions cuz it's in" + }, + { + "start": 2146.16, + "duration": 0.0, + "text": "know by three positions cuz it's in position<00:35:46.520> number<00:35:46.760> three,<00:35:47.560> and<00:35:47.720> what<00:35:47.880> do<00:35:47.960> you" + }, + { + "start": 2148.03, + "duration": 0.0, + "text": "position number three, and what do you" + }, + { + "start": 2148.04, + "duration": 0.0, + "text": "position number three, and what do you know,<00:35:48.280> the<00:35:48.440> relative<00:35:48.960> angle<00:35:49.280> between<00:35:49.640> these" + }, + { + "start": 2149.87, + "duration": 0.0, + "text": "know, the relative angle between these" + }, + { + "start": 2149.88, + "duration": 0.0, + "text": "know, the relative angle between these two<00:35:50.440> is<00:35:50.560> still<00:35:50.760> separated<00:35:51.200> by<00:35:51.359> one,<00:35:51.800> right?" + }, + { + "start": 2152.63, + "duration": 0.0, + "text": "two is still separated by one, right?" + }, + { + "start": 2152.64, + "duration": 0.0, + "text": "two is still separated by one, right? 
So,<00:35:52.880> this<00:35:53.080> is<00:35:53.160> a<00:35:53.200> very,<00:35:53.600> very<00:35:53.840> simple<00:35:54.240> idea<00:35:54.960> of" + }, + { + "start": 2155.11, + "duration": 0.0, + "text": "So, this is a very, very simple idea of" + }, + { + "start": 2155.12, + "duration": 0.0, + "text": "So, this is a very, very simple idea of just<00:35:55.359> using<00:35:55.640> rotations,<00:35:56.760> uh,<00:35:56.880> to<00:35:57.080> represent," + }, + { + "start": 2158.23, + "duration": 0.0, + "text": "just using rotations, uh, to represent," + }, + { + "start": 2158.24, + "duration": 0.0, + "text": "just using rotations, uh, to represent, um," + }, + { + "start": 2159.31, + "duration": 0.0, + "text": "um," + }, + { + "start": 2159.32, + "duration": 0.0, + "text": "um, position.<00:35:59.800> And<00:35:59.880> if<00:36:00.000> we<00:36:00.120> do<00:36:00.280> that,<00:36:00.760> then" + }, + { + "start": 2160.95, + "duration": 0.0, + "text": "position. And if we do that, then" + }, + { + "start": 2160.96, + "duration": 0.0, + "text": "position. And if we do that, then anytime<00:36:01.320> we<00:36:01.400> take<00:36:01.600> an<00:36:01.720> inner<00:36:01.920> product,<00:36:02.480> those" + }, + { + "start": 2162.67, + "duration": 0.0, + "text": "anytime we take an inner product, those" + }, + { + "start": 2162.68, + "duration": 0.0, + "text": "anytime we take an inner product, those inner<00:36:02.920> products<00:36:03.320> are<00:36:03.400> going<00:36:03.520> to<00:36:03.600> be<00:36:03.720> invariant" + }, + { + "start": 2164.67, + "duration": 0.0, + "text": "inner products are going to be invariant" + }, + { + "start": 2164.68, + "duration": 0.0, + "text": "inner products are going to be invariant of<00:36:04.840> absolute<00:36:05.320> positions." + }, + { + "start": 2166.71, + "duration": 0.0, + "text": "of absolute positions." + }, + { + "start": 2166.72, + "duration": 0.0, + "text": "of absolute positions. 
Now,<00:36:06.960> you<00:36:07.080> might<00:36:07.280> say,<00:36:07.920> well,<00:36:08.280> in<00:36:08.440> two" + }, + { + "start": 2168.63, + "duration": 0.0, + "text": "Now, you might say, well, in two" + }, + { + "start": 2168.64, + "duration": 0.0, + "text": "Now, you might say, well, in two dimensions<00:36:09.120> that's<00:36:09.320> pretty<00:36:09.600> easy<00:36:10.000> cuz<00:36:10.160> you've" + }, + { + "start": 2170.31, + "duration": 0.0, + "text": "dimensions that's pretty easy cuz you've" + }, + { + "start": 2170.32, + "duration": 0.0, + "text": "dimensions that's pretty easy cuz you've only<00:36:10.560> really<00:36:10.800> got<00:36:11.000> one<00:36:11.160> choice,<00:36:11.480> you<00:36:11.560> got" + }, + { + "start": 2171.79, + "duration": 0.0, + "text": "only really got one choice, you got" + }, + { + "start": 2171.8, + "duration": 0.0, + "text": "only really got one choice, you got clockwise<00:36:12.359> and<00:36:12.480> counterclockwise,<00:36:13.720> but<00:36:13.840> in" + }, + { + "start": 2173.95, + "duration": 0.0, + "text": "clockwise and counterclockwise, but in" + }, + { + "start": 2173.96, + "duration": 0.0, + "text": "clockwise and counterclockwise, but in high<00:36:14.160> dimensions,<00:36:15.040> there's<00:36:15.240> an<00:36:15.400> infinite" + }, + { + "start": 2176.19, + "duration": 0.0, + "text": "high dimensions, there's an infinite" + }, + { + "start": 2176.2, + "duration": 0.0, + "text": "high dimensions, there's an infinite space<00:36:16.640> of<00:36:16.800> ways<00:36:17.080> that<00:36:17.200> you<00:36:17.280> can<00:36:17.440> rotate" + }, + { + "start": 2177.79, + "duration": 0.0, + "text": "space of ways that you can rotate" + }, + { + "start": 2177.8, + "duration": 0.0, + "text": "space of ways that you can rotate vectors.<00:36:18.240> So,<00:36:18.359> what<00:36:18.480> do<00:36:18.600> you<00:36:18.680> do<00:36:18.840> in<00:36:18.960> D" + }, + { + "start": 2179.11, + "duration": 0.0, + 
"text": "vectors. So, what do you do in D" + }, + { + "start": 2179.12, + "duration": 0.0, + "text": "vectors. So, what do you do in D dimensions?<00:36:20.080> Um,<00:36:20.680> well,<00:36:20.840> you<00:36:20.920> do<00:36:21.040> the" + }, + { + "start": 2181.15, + "duration": 0.0, + "text": "dimensions? Um, well, you do the" + }, + { + "start": 2181.16, + "duration": 0.0, + "text": "dimensions? Um, well, you do the simplest<00:36:21.600> possible<00:36:22.080> thing<00:36:22.359> and<00:36:22.480> it<00:36:22.600> works." + }, + { + "start": 2182.95, + "duration": 0.0, + "text": "simplest possible thing and it works." + }, + { + "start": 2182.96, + "duration": 0.0, + "text": "simplest possible thing and it works. The<00:36:23.040> simplest<00:36:23.440> possible<00:36:23.880> thing<00:36:24.320> is<00:36:24.520> to<00:36:25.000> reduce" + }, + { + "start": 2185.39, + "duration": 0.0, + "text": "The simplest possible thing is to reduce" + }, + { + "start": 2185.4, + "duration": 0.0, + "text": "The simplest possible thing is to reduce it<00:36:25.520> to<00:36:25.600> the<00:36:25.720> 2D<00:36:26.080> case<00:36:26.320> repeatedly.<00:36:27.120> So,<00:36:27.200> you" + }, + { + "start": 2187.23, + "duration": 0.0, + "text": "it to the 2D case repeatedly. So, you" + }, + { + "start": 2187.24, + "duration": 0.0, + "text": "it to the 2D case repeatedly. 
So, you have<00:36:27.320> a<00:36:27.359> D-dimensional<00:36:28.040> vector,<00:36:28.600> just<00:36:28.840> cut<00:36:29.080> it" + }, + { + "start": 2189.23, + "duration": 0.0, + "text": "have a D-dimensional vector, just cut it" + }, + { + "start": 2189.24, + "duration": 0.0, + "text": "have a D-dimensional vector, just cut it up<00:36:29.760> into<00:36:29.960> chunks<00:36:30.320> of<00:36:30.520> two,<00:36:31.160> and<00:36:31.480> each<00:36:31.880> pair<00:36:32.400> of" + }, + { + "start": 2192.63, + "duration": 0.0, + "text": "up into chunks of two, and each pair of" + }, + { + "start": 2192.64, + "duration": 0.0, + "text": "up into chunks of two, and each pair of two<00:36:32.960> dimensions<00:36:33.840> gets<00:36:34.160> rotated.<00:36:35.120> And<00:36:35.240> the" + }, + { + "start": 2195.349, + "duration": 0.0, + "text": "two dimensions gets rotated. And the" + }, + { + "start": 2195.359, + "duration": 0.0, + "text": "two dimensions gets rotated. And the theta<00:36:35.760> at<00:36:35.880> which<00:36:36.080> these<00:36:36.280> things<00:36:36.560> rotate<00:36:37.200> vary," + }, + { + "start": 2197.83, + "duration": 0.0, + "text": "theta at which these things rotate vary," + }, + { + "start": 2197.84, + "duration": 0.0, + "text": "theta at which these things rotate vary, right?<00:36:38.400> Some<00:36:38.600> of<00:36:38.680> them<00:36:38.840> are<00:36:38.960> very<00:36:39.160> low" + }, + { + "start": 2199.39, + "duration": 0.0, + "text": "right? Some of them are very low" + }, + { + "start": 2199.4, + "duration": 0.0, + "text": "right? 
Some of them are very low frequency,<00:36:39.920> so<00:36:40.040> they<00:36:40.160> rotate<00:36:40.520> very<00:36:40.680> slowly," + }, + { + "start": 2201.23, + "duration": 0.0, + "text": "frequency, so they rotate very slowly," + }, + { + "start": 2201.24, + "duration": 0.0, + "text": "frequency, so they rotate very slowly, so<00:36:41.320> they<00:36:41.960> uh,<00:36:42.080> they<00:36:42.200> can<00:36:42.359> capture<00:36:42.720> long-range" + }, + { + "start": 2203.31, + "duration": 0.0, + "text": "so they uh, they can capture long-range" + }, + { + "start": 2203.32, + "duration": 0.0, + "text": "so they uh, they can capture long-range dependence.<00:36:44.200> Some<00:36:44.440> of<00:36:44.520> them<00:36:44.680> rotate<00:36:45.040> very" + }, + { + "start": 2205.23, + "duration": 0.0, + "text": "dependence. Some of them rotate very" + }, + { + "start": 2205.24, + "duration": 0.0, + "text": "dependence. Some of them rotate very quickly,<00:36:45.680> so<00:36:45.800> they<00:36:45.920> capture<00:36:46.320> things<00:36:46.560> like," + }, + { + "start": 2206.75, + "duration": 0.0, + "text": "quickly, so they capture things like," + }, + { + "start": 2206.76, + "duration": 0.0, + "text": "quickly, so they capture things like, are<00:36:46.880> they<00:36:47.040> neighbors<00:36:47.520> to<00:36:47.640> each<00:36:47.800> other,<00:36:48.120> right?" + }, + { + "start": 2208.95, + "duration": 0.0, + "text": "are they neighbors to each other, right?" + }, + { + "start": 2208.96, + "duration": 0.0, + "text": "are they neighbors to each other, right? 
Um,<00:36:49.600> and<00:36:49.760> then<00:36:50.040> at<00:36:50.160> the<00:36:50.320> end,<00:36:50.800> you<00:36:50.880> know,<00:36:51.000> after" + }, + { + "start": 2211.23, + "duration": 0.0, + "text": "Um, and then at the end, you know, after" + }, + { + "start": 2211.24, + "duration": 0.0, + "text": "Um, and then at the end, you know, after I've<00:36:51.440> rotated<00:36:51.920> every<00:36:52.200> pair<00:36:52.440> of<00:36:52.520> vectors,<00:36:53.240> I" + }, + { + "start": 2213.349, + "duration": 0.0, + "text": "I've rotated every pair of vectors, I" + }, + { + "start": 2213.359, + "duration": 0.0, + "text": "I've rotated every pair of vectors, I get<00:36:53.600> sort<00:36:53.760> of<00:36:53.840> my<00:36:54.000> final<00:36:54.680> embeddings.<00:36:55.200> So," + }, + { + "start": 2215.27, + "duration": 0.0, + "text": "get sort of my final embeddings. So," + }, + { + "start": 2215.28, + "duration": 0.0, + "text": "get sort of my final embeddings. So, this<00:36:55.440> is<00:36:55.560> the,<00:36:56.080> you<00:36:56.200> know,<00:36:57.000> rope<00:36:57.240> approach." + }, + { + "start": 2218.23, + "duration": 0.0, + "text": "this is the, you know, rope approach." + }, + { + "start": 2218.24, + "duration": 0.0, + "text": "this is the, you know, rope approach. 
Um," + }, + { + "start": 2218.83, + "duration": 0.0, + "text": "Um," + }, + { + "start": 2218.84, + "duration": 0.0, + "text": "Um, the<00:36:58.960> paper,<00:36:59.440> if<00:36:59.600> you<00:37:00.000> read<00:37:00.280> it,<00:37:00.720> has<00:37:00.880> a<00:37:00.920> very" + }, + { + "start": 2221.43, + "duration": 0.0, + "text": "the paper, if you read it, has a very" + }, + { + "start": 2221.44, + "duration": 0.0, + "text": "the paper, if you read it, has a very complex<00:37:02.160> motivation<00:37:02.840> about<00:37:03.080> complex" + }, + { + "start": 2223.51, + "duration": 0.0, + "text": "complex motivation about complex" + }, + { + "start": 2223.52, + "duration": 0.0, + "text": "complex motivation about complex numbers,<00:37:03.960> but<00:37:04.160> really<00:37:05.000> I<00:37:05.080> think<00:37:05.280> the" + }, + { + "start": 2225.39, + "duration": 0.0, + "text": "numbers, but really I think the" + }, + { + "start": 2225.4, + "duration": 0.0, + "text": "numbers, but really I think the intuitive<00:37:05.920> way,<00:37:06.080> at<00:37:06.200> least<00:37:06.440> to<00:37:06.520> me,<00:37:06.640> to<00:37:06.760> think" + }, + { + "start": 2226.91, + "duration": 0.0, + "text": "intuitive way, at least to me, to think" + }, + { + "start": 2226.92, + "duration": 0.0, + "text": "intuitive way, at least to me, to think about<00:37:07.240> it,<00:37:07.600> is<00:37:07.760> to<00:37:07.880> just<00:37:08.560> you<00:37:08.760> want<00:37:09.000> to<00:37:09.120> rotate" + }, + { + "start": 2229.47, + "duration": 0.0, + "text": "about it, is to just you want to rotate" + }, + { + "start": 2229.48, + "duration": 0.0, + "text": "about it, is to just you want to rotate by<00:37:09.600> reducing<00:37:10.160> to<00:37:10.280> the<00:37:10.400> two-dimensional<00:37:11.080> case," + }, + { + "start": 2231.39, + "duration": 0.0, + "text": "by reducing to the two-dimensional case," + }, + { + "start": 2231.4, + "duration": 0.0, + "text": "by reducing to 
the two-dimensional case, and<00:37:11.520> you're<00:37:11.600> just<00:37:11.840> rotating<00:37:12.320> every<00:37:12.560> pair<00:37:12.800> of" + }, + { + "start": 2232.91, + "duration": 0.0, + "text": "and you're just rotating every pair of" + }, + { + "start": 2232.92, + "duration": 0.0, + "text": "and you're just rotating every pair of coordinates." + }, + { + "start": 2234.23, + "duration": 0.0, + "text": "coordinates." + }, + { + "start": 2234.24, + "duration": 0.0, + "text": "coordinates. Um," + }, + { + "start": 2235.55, + "duration": 0.0, + "text": "Um," + }, + { + "start": 2235.56, + "duration": 0.0, + "text": "Um, Gemma<00:37:15.960> 4<00:37:16.240> just<00:37:16.480> came<00:37:16.680> out<00:37:16.840> on<00:37:17.000> Thursday,<00:37:17.840> and" + }, + { + "start": 2237.99, + "duration": 0.0, + "text": "Gemma 4 just came out on Thursday, and" + }, + { + "start": 2238.0, + "duration": 0.0, + "text": "Gemma 4 just came out on Thursday, and they<00:37:18.160> have<00:37:18.400> like<00:37:18.640> another<00:37:19.160> different<00:37:19.520> kind<00:37:19.680> of" + }, + { + "start": 2239.75, + "duration": 0.0, + "text": "they have like another different kind of" + }, + { + "start": 2239.76, + "duration": 0.0, + "text": "they have like another different kind of fun<00:37:19.960> thing<00:37:20.080> that<00:37:20.240> they<00:37:20.359> do,<00:37:20.640> which<00:37:20.840> they<00:37:20.960> call," + }, + { + "start": 2241.63, + "duration": 0.0, + "text": "fun thing that they do, which they call," + }, + { + "start": 2241.64, + "duration": 0.0, + "text": "fun thing that they do, which they call, um,<00:37:21.680> I<00:37:21.720> think<00:37:21.920> proportional<00:37:22.880> rope<00:37:23.160> or<00:37:23.320> P-rope," + }, + { + "start": 2244.47, + "duration": 0.0, + "text": "um, I think proportional rope or P-rope," + }, + { + "start": 2244.48, + "duration": 0.0, + "text": "um, I think proportional rope or P-rope, um,<00:37:25.240> 
which<00:37:25.440> is<00:37:25.520> a<00:37:25.600> really<00:37:25.840> strange<00:37:26.240> way<00:37:26.359> to" + }, + { + "start": 2246.47, + "duration": 0.0, + "text": "um, which is a really strange way to" + }, + { + "start": 2246.48, + "duration": 0.0, + "text": "um, which is a really strange way to just<00:37:26.680> say<00:37:26.800> that<00:37:26.920> the<00:37:27.000> only<00:37:27.240> thing<00:37:27.400> they<00:37:27.520> rotate" + }, + { + "start": 2247.91, + "duration": 0.0, + "text": "just say that the only thing they rotate" + }, + { + "start": 2247.92, + "duration": 0.0, + "text": "just say that the only thing they rotate is<00:37:28.040> the<00:37:28.120> first<00:37:28.440> two<00:37:28.560> coordinates,<00:37:29.040> but<00:37:29.160> that's" + }, + { + "start": 2249.31, + "duration": 0.0, + "text": "is the first two coordinates, but that's" + }, + { + "start": 2249.32, + "duration": 0.0, + "text": "is the first two coordinates, but that's another<00:37:29.680> valid<00:37:30.000> thing<00:37:30.520> that<00:37:30.680> you<00:37:30.760> can<00:37:30.920> do<00:37:31.120> as" + }, + { + "start": 2251.27, + "duration": 0.0, + "text": "another valid thing that you can do as" + }, + { + "start": 2251.28, + "duration": 0.0, + "text": "another valid thing that you can do as well.<00:37:31.440> So,<00:37:31.520> there's<00:37:31.680> a<00:37:31.760> lot<00:37:32.000> of<00:37:32.080> different" + }, + { + "start": 2252.39, + "duration": 0.0, + "text": "well. So, there's a lot of different" + }, + { + "start": 2252.4, + "duration": 0.0, + "text": "well. 
So, there's a lot of different things<00:37:32.680> that<00:37:32.800> you<00:37:32.880> can<00:37:33.000> do<00:37:33.160> in<00:37:33.240> this<00:37:33.400> space" + }, + { + "start": 2253.67, + "duration": 0.0, + "text": "things that you can do in this space" + }, + { + "start": 2253.68, + "duration": 0.0, + "text": "things that you can do in this space that<00:37:33.840> end<00:37:34.000> up<00:37:34.160> working." + }, + { + "start": 2255.59, + "duration": 0.0, + "text": "that end up working." + }, + { + "start": 2255.6, + "duration": 0.0, + "text": "that end up working. Okay.<00:37:36.400> In<00:37:36.600> practice,<00:37:37.600> what<00:37:37.720> you're<00:37:37.880> going<00:37:38.000> to" + }, + { + "start": 2258.39, + "duration": 0.0, + "text": "Okay. In practice, what you're going to" + }, + { + "start": 2258.4, + "duration": 0.0, + "text": "Okay. In practice, what you're going to end<00:37:38.600> up<00:37:38.760> doing<00:37:39.440> is,<00:37:39.720> you<00:37:39.800> know,<00:37:39.880> you<00:37:40.040> can<00:37:40.560> take" + }, + { + "start": 2261.15, + "duration": 0.0, + "text": "end up doing is, you know, you can take" + }, + { + "start": 2261.16, + "duration": 0.0, + "text": "end up doing is, you know, you can take your<00:37:41.320> vector<00:37:41.760> and<00:37:41.880> you<00:37:41.960> can<00:37:42.080> make<00:37:42.320> a<00:37:42.440> sparse" + }, + { + "start": 2263.23, + "duration": 0.0, + "text": "your vector and you can make a sparse" + }, + { + "start": 2263.24, + "duration": 0.0, + "text": "your vector and you can make a sparse multiply<00:37:43.880> with<00:37:44.080> sines<00:37:44.359> and<00:37:44.480> cosines,<00:37:45.400> and" + }, + { + "start": 2265.51, + "duration": 0.0, + "text": "multiply with sines and cosines, and" + }, + { + "start": 2265.52, + "duration": 0.0, + "text": "multiply with sines and cosines, and this<00:37:45.680> is<00:37:45.800> going<00:37:46.000> to<00:37:46.080> be<00:37:46.200> 
giving<00:37:46.520> you<00:37:46.640> some<00:37:46.920> way" + }, + { + "start": 2267.39, + "duration": 0.0, + "text": "this is going to be giving you some way" + }, + { + "start": 2267.4, + "duration": 0.0, + "text": "this is going to be giving you some way of<00:37:47.600> rotating<00:37:48.080> your<00:37:48.240> input<00:37:48.560> vectors<00:37:49.120> X's," + }, + { + "start": 2269.63, + "duration": 0.0, + "text": "of rotating your input vectors X's," + }, + { + "start": 2269.64, + "duration": 0.0, + "text": "of rotating your input vectors X's, right?<00:37:49.840> So,<00:37:50.000> X<00:37:50.720> times,<00:37:51.359> uh,<00:37:51.640> W<00:37:52.040> times<00:37:52.400> R,<00:37:52.920> this" + }, + { + "start": 2273.11, + "duration": 0.0, + "text": "right? So, X times, uh, W times R, this" + }, + { + "start": 2273.12, + "duration": 0.0, + "text": "right? So, X times, uh, W times R, this is<00:37:53.200> going<00:37:53.320> to<00:37:53.400> be<00:37:53.480> your<00:37:53.640> final<00:37:54.000> embedding<00:37:54.560> that" + }, + { + "start": 2274.75, + "duration": 0.0, + "text": "is going to be your final embedding that" + }, + { + "start": 2274.76, + "duration": 0.0, + "text": "is going to be your final embedding that you<00:37:54.880> get." + }, + { + "start": 2276.349, + "duration": 0.0, + "text": "you get." + }, + { + "start": 2276.359, + "duration": 0.0, + "text": "you get. Okay." + }, + { + "start": 2277.23, + "duration": 0.0, + "text": "Okay." + }, + { + "start": 2277.24, + "duration": 0.0, + "text": "Okay. 
Um,<00:37:57.760> and<00:37:57.880> finally,<00:37:58.240> you<00:37:58.320> know,<00:37:58.440> this<00:37:58.600> is<00:37:58.720> a" + }, + { + "start": 2278.79, + "duration": 0.0, + "text": "Um, and finally, you know, this is a" + }, + { + "start": 2278.8, + "duration": 0.0, + "text": "Um, and finally, you know, this is a sine<00:37:59.080> and<00:37:59.200> cosine,<00:37:59.760> which<00:38:00.000> looks<00:38:00.280> a<00:38:00.359> little" + }, + { + "start": 2280.59, + "duration": 0.0, + "text": "sine and cosine, which looks a little" + }, + { + "start": 2280.6, + "duration": 0.0, + "text": "sine and cosine, which looks a little like<00:38:00.760> sine<00:38:01.280> embeddings,<00:38:02.120> but<00:38:02.280> it's<00:38:02.440> really" + }, + { + "start": 2282.71, + "duration": 0.0, + "text": "like sine embeddings, but it's really" + }, + { + "start": 2282.72, + "duration": 0.0, + "text": "like sine embeddings, but it's really important<00:38:03.200> that<00:38:03.320> I'm<00:38:03.480> multiplying<00:38:04.160> with" + }, + { + "start": 2284.27, + "duration": 0.0, + "text": "important that I'm multiplying with" + }, + { + "start": 2284.28, + "duration": 0.0, + "text": "important that I'm multiplying with these<00:38:04.440> sines<00:38:04.680> and<00:38:04.800> cosines<00:38:05.320> rather<00:38:05.520> than" + }, + { + "start": 2285.71, + "duration": 0.0, + "text": "these sines and cosines rather than" + }, + { + "start": 2285.72, + "duration": 0.0, + "text": "these sines and cosines rather than using<00:38:05.960> them<00:38:06.120> as<00:38:06.240> embeddings<00:38:07.080> cuz<00:38:07.240> that<00:38:07.440> means" + }, + { + "start": 2287.63, + "duration": 0.0, + "text": "using them as embeddings cuz that means" + }, + { + "start": 2287.64, + "duration": 0.0, + "text": "using them as embeddings cuz that means that<00:38:07.760> there<00:38:07.880> are<00:38:07.920> no<00:38:08.080> cross<00:38:08.440> terms.<00:38:09.240> Um,<00:38:09.359> and" + }, + 
{ + "start": 2289.43, + "duration": 0.0, + "text": "that there are no cross terms. Um, and" + }, + { + "start": 2289.44, + "duration": 0.0, + "text": "that there are no cross terms. Um, and this<00:38:09.600> is<00:38:09.760> purely<00:38:10.160> relative,<00:38:10.640> right?<00:38:11.040> There's" + }, + { + "start": 2291.19, + "duration": 0.0, + "text": "this is purely relative, right? There's" + }, + { + "start": 2291.2, + "duration": 0.0, + "text": "this is purely relative, right? There's no<00:38:11.400> absolute<00:38:11.880> position<00:38:12.160> information<00:38:12.680> that" + }, + { + "start": 2292.79, + "duration": 0.0, + "text": "no absolute position information that" + }, + { + "start": 2292.8, + "duration": 0.0, + "text": "no absolute position information that you'll<00:38:12.920> get<00:38:13.480> out<00:38:13.640> of<00:38:13.720> inner<00:38:13.880> products." + }, + { + "start": 2295.39, + "duration": 0.0, + "text": "you'll get out of inner products." + }, + { + "start": 2295.4, + "duration": 0.0, + "text": "you'll get out of inner products. 
Um,<00:38:15.760> if<00:38:15.880> we<00:38:16.000> really,<00:38:16.320> really<00:38:16.600> wanted<00:38:16.840> to<00:38:16.920> get" + }, + { + "start": 2297.07, + "duration": 0.0, + "text": "Um, if we really, really wanted to get" + }, + { + "start": 2297.08, + "duration": 0.0, + "text": "Um, if we really, really wanted to get into<00:38:17.240> low-level<00:38:17.680> details<00:38:18.160> and<00:38:18.280> you<00:38:18.359> ask<00:38:18.520> like," + }, + { + "start": 2298.67, + "duration": 0.0, + "text": "into low-level details and you ask like," + }, + { + "start": 2298.68, + "duration": 0.0, + "text": "into low-level details and you ask like, how<00:38:18.800> do<00:38:18.920> I<00:38:19.000> actually<00:38:19.320> implement<00:38:19.760> this<00:38:19.880> thing," + }, + { + "start": 2300.27, + "duration": 0.0, + "text": "how do I actually implement this thing," + }, + { + "start": 2300.28, + "duration": 0.0, + "text": "how do I actually implement this thing, you<00:38:20.359> know,<00:38:20.440> you're<00:38:20.560> going<00:38:20.680> to<00:38:20.720> have<00:38:20.880> to<00:38:21.000> do" + }, + { + "start": 2301.15, + "duration": 0.0, + "text": "you know, you're going to have to do" + }, + { + "start": 2301.16, + "duration": 0.0, + "text": "you know, you're going to have to do that.<00:38:21.880> Um,<00:38:22.680> you<00:38:22.840> have<00:38:23.040> your<00:38:23.240> usual<00:38:23.560> attention" + }, + { + "start": 2303.99, + "duration": 0.0, + "text": "that. Um, you have your usual attention" + }, + { + "start": 2304.0, + "duration": 0.0, + "text": "that. 
Um, you have your usual attention stuff,<00:38:24.640> and<00:38:24.800> then<00:38:24.920> what<00:38:25.080> you<00:38:25.160> do<00:38:25.280> is<00:38:25.400> you" + }, + { + "start": 2305.51, + "duration": 0.0, + "text": "stuff, and then what you do is you" + }, + { + "start": 2305.52, + "duration": 0.0, + "text": "stuff, and then what you do is you generate<00:38:25.880> cosine<00:38:26.320> and<00:38:26.440> sine<00:38:26.760> angles,<00:38:27.600> um," + }, + { + "start": 2307.83, + "duration": 0.0, + "text": "generate cosine and sine angles, um," + }, + { + "start": 2307.84, + "duration": 0.0, + "text": "generate cosine and sine angles, um, based<00:38:28.160> on<00:38:28.240> the<00:38:28.359> position<00:38:28.760> IDs<00:38:29.520> of<00:38:29.680> sort<00:38:29.840> of" + }, + { + "start": 2309.95, + "duration": 0.0, + "text": "based on the position IDs of sort of" + }, + { + "start": 2309.96, + "duration": 0.0, + "text": "based on the position IDs of sort of where<00:38:30.160> your<00:38:30.280> sequence<00:38:30.760> is,<00:38:31.440> and<00:38:31.600> then<00:38:32.080> you're" + }, + { + "start": 2312.19, + "duration": 0.0, + "text": "where your sequence is, and then you're" + }, + { + "start": 2312.2, + "duration": 0.0, + "text": "where your sequence is, and then you're going<00:38:32.320> to<00:38:32.359> apply<00:38:32.720> those<00:38:32.920> cosines<00:38:33.400> and<00:38:33.560> sines" + }, + { + "start": 2314.27, + "duration": 0.0, + "text": "going to apply those cosines and sines" + }, + { + "start": 2314.28, + "duration": 0.0, + "text": "going to apply those cosines and sines onto<00:38:34.760> both<00:38:34.960> your<00:38:35.120> queries<00:38:35.600> and<00:38:35.760> keys<00:38:36.320> for<00:38:36.520> your" + }, + { + "start": 2316.67, + "duration": 0.0, + "text": "onto both your queries and keys for your" + }, + { + "start": 2316.68, + "duration": 0.0, + "text": "onto both your queries and keys for your sort<00:38:36.800> 
of<00:38:36.880> attention<00:38:37.320> computation,<00:38:38.440> um,<00:38:38.600> and" + }, + { + "start": 2318.67, + "duration": 0.0, + "text": "sort of attention computation, um, and" + }, + { + "start": 2318.68, + "duration": 0.0, + "text": "sort of attention computation, um, and you<00:38:38.760> can<00:38:38.880> either<00:38:39.120> apply<00:38:39.359> them<00:38:39.480> as<00:38:39.560> a<00:38:39.600> matrix" + }, + { + "start": 2319.95, + "duration": 0.0, + "text": "you can either apply them as a matrix" + }, + { + "start": 2319.96, + "duration": 0.0, + "text": "you can either apply them as a matrix multiply<00:38:40.359> or<00:38:40.440> you<00:38:40.520> can<00:38:40.640> go<00:38:40.800> through<00:38:41.000> and<00:38:41.080> apply" + }, + { + "start": 2321.27, + "duration": 0.0, + "text": "multiply or you can go through and apply" + }, + { + "start": 2321.28, + "duration": 0.0, + "text": "multiply or you can go through and apply them<00:38:41.440> manually,<00:38:42.359> uh,<00:38:42.440> just<00:38:42.680> as<00:38:42.800> a<00:38:42.840> rotation," + }, + { + "start": 2323.55, + "duration": 0.0, + "text": "them manually, uh, just as a rotation," + }, + { + "start": 2323.56, + "duration": 0.0, + "text": "them manually, uh, just as a rotation, right?<00:38:43.840> Fairly<00:38:44.080> straightforward,<00:38:44.680> and<00:38:44.760> you" + }, + { + "start": 2324.83, + "duration": 0.0, + "text": "right? Fairly straightforward, and you" + }, + { + "start": 2324.84, + "duration": 0.0, + "text": "right? 
Fairly straightforward, and you would<00:38:44.960> do<00:38:45.080> this<00:38:45.560> at<00:38:45.720> the<00:38:45.800> attention<00:38:46.280> level" + }, + { + "start": 2326.87, + "duration": 0.0, + "text": "would do this at the attention level" + }, + { + "start": 2326.88, + "duration": 0.0, + "text": "would do this at the attention level rather<00:38:47.160> than<00:38:47.280> at<00:38:47.359> the<00:38:47.480> very<00:38:47.680> bottom<00:38:48.400> to<00:38:48.480> sort" + }, + { + "start": 2328.63, + "duration": 0.0, + "text": "rather than at the very bottom to sort" + }, + { + "start": 2328.64, + "duration": 0.0, + "text": "rather than at the very bottom to sort of<00:38:48.720> enforce<00:38:49.200> position<00:38:49.560> invariance<00:38:50.440> every" + }, + { + "start": 2330.63, + "duration": 0.0, + "text": "of enforce position invariance every" + }, + { + "start": 2330.64, + "duration": 0.0, + "text": "of enforce position invariance every time<00:38:50.840> you're<00:38:50.960> doing<00:38:51.200> attention" + }, + { + "start": 2331.55, + "duration": 0.0, + "text": "time you're doing attention" + }, + { + "start": 2331.56, + "duration": 0.0, + "text": "time you're doing attention computations." + }, + { + "start": 2333.349, + "duration": 0.0, + "text": "computations." + }, + { + "start": 2333.359, + "duration": 0.0, + "text": "computations. Okay.<00:38:53.960> So,<00:38:54.040> that<00:38:54.160> was<00:38:54.320> rope.<00:38:55.040> Um,<00:38:55.240> it<00:38:55.359> is<00:38:55.480> a" + }, + { + "start": 2335.51, + "duration": 0.0, + "text": "Okay. So, that was rope. Um, it is a" + }, + { + "start": 2335.52, + "duration": 0.0, + "text": "Okay. So, that was rope. 
Um, it is a little<00:38:55.720> bit<00:38:55.880> confusing,<00:38:56.440> but<00:38:56.600> once<00:38:56.800> you" + }, + { + "start": 2336.87, + "duration": 0.0, + "text": "little bit confusing, but once you" + }, + { + "start": 2336.88, + "duration": 0.0, + "text": "little bit confusing, but once you understand<00:38:57.320> the<00:38:57.440> geometry<00:38:57.920> of<00:38:58.000> just<00:38:58.160> rotating" + }, + { + "start": 2338.63, + "duration": 0.0, + "text": "understand the geometry of just rotating" + }, + { + "start": 2338.64, + "duration": 0.0, + "text": "understand the geometry of just rotating things,<00:38:58.880> it's<00:38:59.000> actually<00:38:59.359> fairly,<00:39:00.200> uh," + }, + { + "start": 2340.31, + "duration": 0.0, + "text": "things, it's actually fairly, uh," + }, + { + "start": 2340.32, + "duration": 0.0, + "text": "things, it's actually fairly, uh, straightforward." + }, + { + "start": 2342.03, + "duration": 0.0, + "text": "straightforward." + }, + { + "start": 2342.04, + "duration": 0.0, + "text": "straightforward. Okay.<00:39:02.640> I'm<00:39:02.720> going<00:39:02.840> to<00:39:02.960> pause<00:39:03.440> here<00:39:03.760> for<00:39:04.080> one" + }, + { + "start": 2344.55, + "duration": 0.0, + "text": "Okay. I'm going to pause here for one" + }, + { + "start": 2344.56, + "duration": 0.0, + "text": "Okay. 
I'm going to pause here for one moment,<00:39:05.280> um,<00:39:05.440> in<00:39:05.560> case<00:39:05.840> anyone<00:39:06.200> has<00:39:06.359> any" + }, + { + "start": 2346.47, + "duration": 0.0, + "text": "moment, um, in case anyone has any" + }, + { + "start": 2346.48, + "duration": 0.0, + "text": "moment, um, in case anyone has any questions<00:39:06.840> about<00:39:07.560> the<00:39:07.640> various<00:39:08.200> like" + }, + { + "start": 2348.47, + "duration": 0.0, + "text": "questions about the various like" + }, + { + "start": 2348.48, + "duration": 0.0, + "text": "questions about the various like architecture<00:39:09.080> bits.<00:39:09.720> Um,<00:39:10.120> we're<00:39:10.200> going<00:39:10.320> to" + }, + { + "start": 2350.39, + "duration": 0.0, + "text": "architecture bits. Um, we're going to" + }, + { + "start": 2350.4, + "duration": 0.0, + "text": "architecture bits. Um, we're going to then<00:39:10.600> talk<00:39:10.840> about<00:39:11.960> even<00:39:12.280> lower-level<00:39:12.840> details" + }, + { + "start": 2353.19, + "duration": 0.0, + "text": "then talk about even lower-level details" + }, + { + "start": 2353.2, + "duration": 0.0, + "text": "then talk about even lower-level details about<00:39:13.440> hyper<00:39:13.760> parameters.<00:39:14.240> So,<00:39:14.359> yes." + }, + { + "start": 2356.79, + "duration": 0.0, + "text": "about hyper parameters. So, yes." + }, + { + "start": 2356.8, + "duration": 0.0, + "text": "about hyper parameters. So, yes. Do<00:39:16.880> you<00:39:16.920> know<00:39:17.000> about<00:39:17.160> any<00:39:17.280> papers<00:39:17.640> that<00:39:17.760> do<00:39:17.880> a" + }, + { + "start": 2357.91, + "duration": 0.0, + "text": "Do you know about any papers that do a" + }, + { + "start": 2357.92, + "duration": 0.0, + "text": "Do you know about any papers that do a higher-dimensional<00:39:18.680> rotation?" + }, + { + "start": 2359.63, + "duration": 0.0, + "text": "higher-dimensional rotation?" 
+ }, + { + "start": 2359.64, + "duration": 0.0, + "text": "higher-dimensional rotation? Higher-dimensional<00:39:20.520> rotation<00:39:21.040> never" + }, + { + "start": 2361.27, + "duration": 0.0, + "text": "Higher-dimensional rotation never" + }, + { + "start": 2361.28, + "duration": 0.0, + "text": "Higher-dimensional rotation never worked?" + }, + { + "start": 2362.03, + "duration": 0.0, + "text": "worked?" + }, + { + "start": 2362.04, + "duration": 0.0, + "text": "worked? It's<00:39:22.160> a<00:39:22.200> good<00:39:22.480> question." + }, + { + "start": 2363.91, + "duration": 0.0, + "text": "It's a good question." + }, + { + "start": 2363.92, + "duration": 0.0, + "text": "It's a good question. I<00:39:24.040> don't<00:39:24.320> think<00:39:24.520> so.<00:39:24.680> By<00:39:24.800> a" + }, + { + "start": 2364.87, + "duration": 0.0, + "text": "I don't think so. By a" + }, + { + "start": 2364.88, + "duration": 0.0, + "text": "I don't think so. By a higher-dimensional<00:39:25.560> rotation,<00:39:26.080> like<00:39:26.280> any," + }, + { + "start": 2366.71, + "duration": 0.0, + "text": "higher-dimensional rotation, like any," + }, + { + "start": 2366.72, + "duration": 0.0, + "text": "higher-dimensional rotation, like any, you<00:39:26.760> know,<00:39:26.880> 2D<00:39:27.280> rotation<00:39:28.240> in<00:39:28.359> the<00:39:28.440> space<00:39:28.800> would" + }, + { + "start": 2368.87, + "duration": 0.0, + "text": "you know, 2D rotation in the space would" + }, + { + "start": 2368.88, + "duration": 0.0, + "text": "you know, 2D rotation in the space would just<00:39:29.080> be<00:39:29.200> kind<00:39:29.359> of<00:39:29.480> a<00:39:29.720> variant<00:39:30.240> of<00:39:30.320> this.<00:39:31.000> You" + }, + { + "start": 2371.15, + "duration": 0.0, + "text": "just be kind of a variant of this. You" + }, + { + "start": 2371.16, + "duration": 0.0, + "text": "just be kind of a variant of this. 
You could<00:39:31.440> certainly<00:39:31.800> do<00:39:31.960> like<00:39:32.600> any<00:39:32.880> one<00:39:33.120> manifold" + }, + { + "start": 2373.67, + "duration": 0.0, + "text": "could certainly do like any one manifold" + }, + { + "start": 2373.68, + "duration": 0.0, + "text": "could certainly do like any one manifold that<00:39:33.800> like<00:39:33.960> is<00:39:34.080> a<00:39:34.120> closed<00:39:34.520> loop.<00:39:34.760> I<00:39:34.800> have<00:39:34.960> not" + }, + { + "start": 2375.15, + "duration": 0.0, + "text": "that like is a closed loop. I have not" + }, + { + "start": 2375.16, + "duration": 0.0, + "text": "that like is a closed loop. I have not seen<00:39:35.320> that." + }, + { + "start": 2376.87, + "duration": 0.0, + "text": "seen that." + }, + { + "start": 2376.88, + "duration": 0.0, + "text": "seen that. Yes.<00:39:37.560> What<00:39:37.880> do<00:39:37.960> you<00:39:38.000> recommend<00:39:38.320> for<00:39:38.440> this?" + }, + { + "start": 2379.03, + "duration": 0.0, + "text": "Yes. What do you recommend for this?" + }, + { + "start": 2379.04, + "duration": 0.0, + "text": "Yes. What do you recommend for this? What<00:39:39.160> do<00:39:39.240> you<00:39:39.280> think<00:39:39.680> is<00:39:39.800> the<00:39:39.880> best<00:39:40.200> way<00:39:40.359> to" + }, + { + "start": 2380.47, + "duration": 0.0, + "text": "What do you think is the best way to" + }, + { + "start": 2380.48, + "duration": 0.0, + "text": "What do you think is the best way to distill<00:39:40.840> this<00:39:41.000> kind<00:39:41.160> of<00:39:41.320> knowledge<00:39:41.720> problem." + }, + { + "start": 2383.47, + "duration": 0.0, + "text": "distill this kind of knowledge problem." + }, + { + "start": 2383.48, + "duration": 0.0, + "text": "distill this kind of knowledge problem. People<00:39:43.840> who<00:39:43.920> are<00:39:44.000> back<00:39:44.240> to<00:39:44.440> work<00:39:44.560> boards." 
+ }, + { + "start": 2385.63, + "duration": 0.0, + "text": "People who are back to work boards." + }, + { + "start": 2385.64, + "duration": 0.0, + "text": "People who are back to work boards. It's<00:39:45.840> a<00:39:45.880> good<00:39:46.120> question.<00:39:47.080> Um" + }, + { + "start": 2389.16, + "duration": 0.0, + "text": "I<00:39:49.240> don't<00:39:49.480> know<00:39:49.640> if<00:39:49.760> there's<00:39:50.000> a<00:39:50.040> way<00:39:50.560> beyond" + }, + { + "start": 2391.71, + "duration": 0.0, + "text": "I don't know if there's a way beyond" + }, + { + "start": 2391.72, + "duration": 0.0, + "text": "I don't know if there's a way beyond some<00:39:51.960> combination<00:39:52.600> of<00:39:52.720> like<00:39:52.880> looking<00:39:53.200> broadly" + }, + { + "start": 2393.67, + "duration": 0.0, + "text": "some combination of like looking broadly" + }, + { + "start": 2393.68, + "duration": 0.0, + "text": "some combination of like looking broadly enough<00:39:54.000> to<00:39:54.120> get<00:39:54.320> a<00:39:54.600> to<00:39:54.680> get<00:39:54.800> to<00:39:54.840> get<00:39:54.960> a<00:39:55.000> pattern," + }, + { + "start": 2395.31, + "duration": 0.0, + "text": "enough to get a to get to get a pattern," + }, + { + "start": 2395.32, + "duration": 0.0, + "text": "enough to get a to get to get a pattern, which<00:39:55.480> is<00:39:55.600> what<00:39:55.720> I<00:39:55.920> the<00:39:56.120> procedure<00:39:56.560> I'm<00:39:56.640> trying" + }, + { + "start": 2396.79, + "duration": 0.0, + "text": "which is what I the procedure I'm trying" + }, + { + "start": 2396.8, + "duration": 0.0, + "text": "which is what I the procedure I'm trying to<00:39:56.880> do<00:39:57.000> in<00:39:57.040> this<00:39:57.200> lecture<00:39:57.480> here.<00:39:58.080> And<00:39:58.200> then<00:39:58.320> the" + }, + { + "start": 2398.43, + "duration": 0.0, + "text": "to do in this lecture here. 
And then the" + }, + { + "start": 2398.44, + "duration": 0.0, + "text": "to do in this lecture here. And then the other<00:39:58.640> one<00:39:58.760> is<00:39:58.880> to<00:39:59.000> try<00:39:59.240> it<00:39:59.360> yourself<00:39:59.800> even<00:40:00.040> a" + }, + { + "start": 2400.07, + "duration": 0.0, + "text": "other one is to try it yourself even a" + }, + { + "start": 2400.08, + "duration": 0.0, + "text": "other one is to try it yourself even a much<00:40:00.400> smaller<00:40:00.760> scale<00:40:01.200> to<00:40:01.320> form<00:40:01.520> an<00:40:01.600> intuition" + }, + { + "start": 2402.07, + "duration": 0.0, + "text": "much smaller scale to form an intuition" + }, + { + "start": 2402.08, + "duration": 0.0, + "text": "much smaller scale to form an intuition and<00:40:02.160> like<00:40:02.320> a<00:40:02.360> theory<00:40:02.880> for<00:40:03.040> how<00:40:03.200> these<00:40:03.400> things" + }, + { + "start": 2403.63, + "duration": 0.0, + "text": "and like a theory for how these things" + }, + { + "start": 2403.64, + "duration": 0.0, + "text": "and like a theory for how these things come<00:40:03.800> together.<00:40:04.400> I<00:40:04.480> think<00:40:04.680> those<00:40:04.920> two<00:40:05.040> are" + }, + { + "start": 2405.15, + "duration": 0.0, + "text": "come together. I think those two are" + }, + { + "start": 2405.16, + "duration": 0.0, + "text": "come together. I think those two are really<00:40:05.400> the<00:40:05.520> right<00:40:05.760> ways.<00:40:06.440> I<00:40:06.520> think<00:40:06.760> reading" + }, + { + "start": 2407.03, + "duration": 0.0, + "text": "really the right ways. I think reading" + }, + { + "start": 2407.04, + "duration": 0.0, + "text": "really the right ways. 
I think reading any<00:40:07.240> single<00:40:07.560> paper<00:40:07.840> in<00:40:07.920> isolation<00:40:08.400> is<00:40:08.520> very" + }, + { + "start": 2408.79, + "duration": 0.0, + "text": "any single paper in isolation is very" + }, + { + "start": 2408.8, + "duration": 0.0, + "text": "any single paper in isolation is very very<00:40:09.080> difficult<00:40:09.760> especially<00:40:10.320> now<00:40:10.640> because<00:40:11.400> no" + }, + { + "start": 2411.71, + "duration": 0.0, + "text": "very difficult especially now because no" + }, + { + "start": 2411.72, + "duration": 0.0, + "text": "very difficult especially now because no single<00:40:12.040> paper<00:40:12.360> seems<00:40:12.600> to<00:40:12.720> give<00:40:12.880> any<00:40:13.360> full" + }, + { + "start": 2413.55, + "duration": 0.0, + "text": "single paper seems to give any full" + }, + { + "start": 2413.56, + "duration": 0.0, + "text": "single paper seems to give any full detail<00:40:14.000> for<00:40:14.120> a<00:40:14.160> lot<00:40:14.400> of<00:40:14.480> language<00:40:14.760> models" + }, + { + "start": 2415.07, + "duration": 0.0, + "text": "detail for a lot of language models" + }, + { + "start": 2415.08, + "duration": 0.0, + "text": "detail for a lot of language models these<00:40:15.240> days." + }, + { + "start": 2417.8, + "duration": 0.0, + "text": "Oh,<00:40:18.240> lots<00:40:18.400> of<00:40:18.480> questions<00:40:18.760> now.<00:40:18.880> Okay,<00:40:19.080> good." + }, + { + "start": 2419.51, + "duration": 0.0, + "text": "Oh, lots of questions now. Okay, good." + }, + { + "start": 2419.52, + "duration": 0.0, + "text": "Oh, lots of questions now. Okay, good. We'll<00:40:19.640> go<00:40:19.800> in<00:40:20.320> Yeah.<00:40:20.600> Um<00:40:21.080> so<00:40:21.320> I<00:40:21.360> have<00:40:21.640> a" + }, + { + "start": 2421.67, + "duration": 0.0, + "text": "We'll go in Yeah. Um so I have a" + }, + { + "start": 2421.68, + "duration": 0.0, + "text": "We'll go in Yeah. 
Um so I have a question<00:40:22.360> about<00:40:22.520> the<00:40:22.560> question<00:40:22.800> on<00:40:22.880> the" + }, + { + "start": 2422.95, + "duration": 0.0, + "text": "question about the question on the" + }, + { + "start": 2422.96, + "duration": 0.0, + "text": "question about the question on the parallel<00:40:23.360> layers<00:40:23.680> and<00:40:23.960> the<00:40:24.360> the<00:40:24.440> serial" + }, + { + "start": 2424.83, + "duration": 0.0, + "text": "parallel layers and the the serial" + }, + { + "start": 2424.84, + "duration": 0.0, + "text": "parallel layers and the the serial layers.<00:40:25.400> Yeah,<00:40:25.600> I<00:40:25.680> understand<00:40:26.560> the<00:40:26.680> modern" + }, + { + "start": 2426.99, + "duration": 0.0, + "text": "layers. Yeah, I understand the modern" + }, + { + "start": 2427.0, + "duration": 0.0, + "text": "layers. Yeah, I understand the modern models<00:40:27.560> are" + }, + { + "start": 2428.19, + "duration": 0.0, + "text": "models are" + }, + { + "start": 2428.2, + "duration": 0.0, + "text": "models are thinking<00:40:28.480> of<00:40:28.640> the<00:40:28.960> resource<00:40:29.400> efficiency.<00:40:30.240> So" + }, + { + "start": 2430.35, + "duration": 0.0, + "text": "thinking of the resource efficiency. So" + }, + { + "start": 2430.36, + "duration": 0.0, + "text": "thinking of the resource efficiency. So they<00:40:30.480> will<00:40:30.600> use<00:40:30.800> the<00:40:30.920> parallel<00:40:31.320> layers.<00:40:32.120> They" + }, + { + "start": 2432.23, + "duration": 0.0, + "text": "they will use the parallel layers. They" + }, + { + "start": 2432.24, + "duration": 0.0, + "text": "they will use the parallel layers. 
They have<00:40:32.400> the" + }, + { + "start": 2432.99, + "duration": 0.0, + "text": "have the" + }, + { + "start": 2433.0, + "duration": 0.0, + "text": "have the idea<00:40:33.240> but<00:40:33.400> there's<00:40:34.120> there's<00:40:34.280> a<00:40:34.320> difference" + }, + { + "start": 2434.71, + "duration": 0.0, + "text": "idea but there's there's a difference" + }, + { + "start": 2434.72, + "duration": 0.0, + "text": "idea but there's there's a difference there's<00:40:34.880> a<00:40:35.240> there's<00:40:35.400> a<00:40:35.440> big<00:40:35.680> difference" + }, + { + "start": 2435.91, + "duration": 0.0, + "text": "there's a there's a big difference" + }, + { + "start": 2435.92, + "duration": 0.0, + "text": "there's a there's a big difference between<00:40:36.400> the<00:40:36.640> accuracy<00:40:37.760> um<00:40:38.120> for<00:40:38.240> these<00:40:38.400> two" + }, + { + "start": 2438.51, + "duration": 0.0, + "text": "between the accuracy um for these two" + }, + { + "start": 2438.52, + "duration": 0.0, + "text": "between the accuracy um for these two patterns,<00:40:38.840> right?<00:40:39.160> I<00:40:39.240> want<00:40:39.360> to<00:40:39.440> know<00:40:39.520> like" + }, + { + "start": 2439.67, + "duration": 0.0, + "text": "patterns, right? I want to know like" + }, + { + "start": 2439.68, + "duration": 0.0, + "text": "patterns, right? 
I want to know like what's<00:40:39.920> the<00:40:40.080> What's<00:40:40.200> the" + }, + { + "start": 2441.27, + "duration": 0.0, + "text": "what's the What's the" + }, + { + "start": 2441.28, + "duration": 0.0, + "text": "what's the What's the What's<00:40:41.440> the<00:40:41.560> difference<00:40:41.880> of<00:40:42.320> of<00:40:42.440> the" + }, + { + "start": 2442.51, + "duration": 0.0, + "text": "What's the difference of of the" + }, + { + "start": 2442.52, + "duration": 0.0, + "text": "What's the difference of of the accuracy?<00:40:43.000> Is<00:40:43.120> it<00:40:43.200> big<00:40:43.400> enough<00:40:43.880> to<00:40:44.400> Is<00:40:44.480> it" + }, + { + "start": 2444.55, + "duration": 0.0, + "text": "accuracy? Is it big enough to Is it" + }, + { + "start": 2444.56, + "duration": 0.0, + "text": "accuracy? Is it big enough to Is it small<00:40:44.920> enough<00:40:45.200> to<00:40:45.360> allow<00:40:45.640> the<00:40:45.840> current<00:40:46.560> model" + }, + { + "start": 2446.83, + "duration": 0.0, + "text": "small enough to allow the current model" + }, + { + "start": 2446.84, + "duration": 0.0, + "text": "small enough to allow the current model trainers<00:40:47.360> to<00:40:47.720> ignore<00:40:48.080> that<00:40:48.320> or<00:40:48.520> is<00:40:48.600> there<00:40:48.720> any" + }, + { + "start": 2449.35, + "duration": 0.0, + "text": "trainers to ignore that or is there any" + }, + { + "start": 2449.36, + "duration": 0.0, + "text": "trainers to ignore that or is there any problem?<00:40:50.000> Yeah,<00:40:50.320> I<00:40:50.400> think<00:40:51.000> you<00:40:51.080> know,<00:40:51.360> the" + }, + { + "start": 2452.39, + "duration": 0.0, + "text": "problem? Yeah, I think you know, the" + }, + { + "start": 2452.4, + "duration": 0.0, + "text": "problem? 
Yeah, I think you know, the that's<00:40:52.640> actually<00:40:52.880> really<00:40:53.160> mixed.<00:40:53.920> So<00:40:54.040> if<00:40:54.160> you" + }, + { + "start": 2454.27, + "duration": 0.0, + "text": "that's actually really mixed. So if you" + }, + { + "start": 2454.28, + "duration": 0.0, + "text": "that's actually really mixed. So if you read<00:40:54.480> the<00:40:54.520> original<00:40:54.960> Palm<00:40:55.240> paper,<00:40:55.600> I<00:40:55.640> think" + }, + { + "start": 2455.83, + "duration": 0.0, + "text": "read the original Palm paper, I think" + }, + { + "start": 2455.84, + "duration": 0.0, + "text": "read the original Palm paper, I think they're<00:40:55.960> like<00:40:56.160> very<00:40:56.600> confident<00:40:57.280> about<00:40:57.680> the" + }, + { + "start": 2457.75, + "duration": 0.0, + "text": "they're like very confident about the" + }, + { + "start": 2457.76, + "duration": 0.0, + "text": "they're like very confident about the use<00:40:57.920> of<00:40:58.040> parallel<00:40:58.400> layers<00:40:58.720> like<00:40:58.880> no" + }, + { + "start": 2459.07, + "duration": 0.0, + "text": "use of parallel layers like no" + }, + { + "start": 2459.08, + "duration": 0.0, + "text": "use of parallel layers like no performance<00:40:59.600> drop<00:40:59.920> 15%<00:41:00.640> systems<00:41:01.000> utilization" + }, + { + "start": 2461.55, + "duration": 0.0, + "text": "performance drop 15% systems utilization" + }, + { + "start": 2461.56, + "duration": 0.0, + "text": "performance drop 15% systems utilization improvement.<00:41:02.320> So<00:41:02.400> if<00:41:02.520> you<00:41:02.640> read<00:41:02.840> just<00:41:03.000> that" + }, + { + "start": 2463.19, + "duration": 0.0, + "text": "improvement. So if you read just that" + }, + { + "start": 2463.2, + "duration": 0.0, + "text": "improvement. 
So if you read just that you'll<00:41:03.320> kind<00:41:03.520> of<00:41:03.600> say<00:41:03.720> like<00:41:03.880> oh,<00:41:04.280> it's<00:41:04.480> just<00:41:04.680> as" + }, + { + "start": 2464.79, + "duration": 0.0, + "text": "you'll kind of say like oh, it's just as" + }, + { + "start": 2464.8, + "duration": 0.0, + "text": "you'll kind of say like oh, it's just as good.<00:41:05.600> Um<00:41:06.280> but<00:41:06.400> I<00:41:06.440> think<00:41:06.600> a<00:41:06.640> lot<00:41:06.920> of<00:41:07.000> the<00:41:07.400> the" + }, + { + "start": 2467.51, + "duration": 0.0, + "text": "good. Um but I think a lot of the the" + }, + { + "start": 2467.52, + "duration": 0.0, + "text": "good. Um but I think a lot of the the later<00:41:07.840> Google<00:41:08.080> models<00:41:08.480> have<00:41:08.600> stopped<00:41:08.960> using" + }, + { + "start": 2469.23, + "duration": 0.0, + "text": "later Google models have stopped using" + }, + { + "start": 2469.24, + "duration": 0.0, + "text": "later Google models have stopped using this,<00:41:09.800> which<00:41:09.960> you<00:41:10.040> can<00:41:10.160> take<00:41:10.440> on<00:41:11.000> as<00:41:11.120> an" + }, + { + "start": 2471.23, + "duration": 0.0, + "text": "this, which you can take on as an" + }, + { + "start": 2471.24, + "duration": 0.0, + "text": "this, which you can take on as an implicit<00:41:11.600> signal<00:41:11.880> that<00:41:12.000> actually<00:41:12.200> there" + }, + { + "start": 2472.31, + "duration": 0.0, + "text": "implicit signal that actually there" + }, + { + "start": 2472.32, + "duration": 0.0, + "text": "implicit signal that actually there might<00:41:12.480> be<00:41:12.600> some<00:41:13.080> losses.<00:41:13.960> And<00:41:14.080> once<00:41:14.280> again," + }, + { + "start": 2474.67, + "duration": 0.0, + "text": "might be some losses. And once again," + }, + { + "start": 2474.68, + "duration": 0.0, + "text": "might be some losses. 
And once again, this<00:41:14.840> one<00:41:14.960> is<00:41:15.040> a<00:41:15.120> little<00:41:15.440> bit<00:41:15.600> hard<00:41:15.920> to<00:41:16.600> um<00:41:17.120> to" + }, + { + "start": 2477.35, + "duration": 0.0, + "text": "this one is a little bit hard to um to" + }, + { + "start": 2477.36, + "duration": 0.0, + "text": "this one is a little bit hard to um to get<00:41:17.560> precise<00:41:17.920> numbers<00:41:18.280> on<00:41:18.440> because<00:41:18.680> no<00:41:18.840> one's" + }, + { + "start": 2479.03, + "duration": 0.0, + "text": "get precise numbers on because no one's" + }, + { + "start": 2479.04, + "duration": 0.0, + "text": "get precise numbers on because no one's done<00:41:19.200> the<00:41:19.280> ablations<00:41:19.880> as<00:41:20.000> far<00:41:20.160> as<00:41:20.280> I<00:41:20.320> know<00:41:20.480> on" + }, + { + "start": 2480.55, + "duration": 0.0, + "text": "done the ablations as far as I know on" + }, + { + "start": 2480.56, + "duration": 0.0, + "text": "done the ablations as far as I know on parallel<00:41:20.920> versus<00:41:21.200> serial<00:41:21.920> um<00:41:22.120> controlled" + }, + { + "start": 2482.55, + "duration": 0.0, + "text": "parallel versus serial um controlled" + }, + { + "start": 2482.56, + "duration": 0.0, + "text": "parallel versus serial um controlled nice<00:41:22.800> ablations<00:41:23.200> at<00:41:23.280> least." + }, + { + "start": 2485.27, + "duration": 0.0, + "text": "nice ablations at least." + }, + { + "start": 2485.28, + "duration": 0.0, + "text": "nice ablations at least. Yeah." + }, + { + "start": 2486.35, + "duration": 0.0, + "text": "Yeah." + }, + { + "start": 2486.36, + "duration": 0.0, + "text": "Yeah. 
So<00:41:26.680> yeah,<00:41:26.840> so" + }, + { + "start": 2487.75, + "duration": 0.0, + "text": "So yeah, so" + }, + { + "start": 2487.76, + "duration": 0.0, + "text": "So yeah, so what's<00:41:28.000> the<00:41:28.120> difference<00:41:28.520> between<00:41:28.840> like<00:41:29.200> Eagle" + }, + { + "start": 2489.59, + "duration": 0.0, + "text": "what's the difference between like Eagle" + }, + { + "start": 2489.6, + "duration": 0.0, + "text": "what's the difference between like Eagle and<00:41:29.840> RoPE?" + }, + { + "start": 2494.92, + "duration": 0.0, + "text": "Yeah,<00:41:35.200> yeah.<00:41:35.400> I<00:41:35.440> mean<00:41:36.080> this<00:41:36.280> difference<00:41:36.600> is" + }, + { + "start": 2496.71, + "duration": 0.0, + "text": "Yeah, yeah. I mean this difference is" + }, + { + "start": 2496.72, + "duration": 0.0, + "text": "Yeah, yeah. I mean this difference is really<00:41:36.960> just<00:41:37.120> like<00:41:37.360> which<00:41:37.640> of<00:41:37.720> the" + }, + { + "start": 2497.83, + "duration": 0.0, + "text": "really just like which of the" + }, + { + "start": 2497.84, + "duration": 0.0, + "text": "really just like which of the coordinates<00:41:38.200> you're<00:41:38.320> rotating." + }, + { + "start": 2499.43, + "duration": 0.0, + "text": "coordinates you're rotating." + }, + { + "start": 2499.44, + "duration": 0.0, + "text": "coordinates you're rotating. 
Like<00:41:39.560> you<00:41:39.640> don't<00:41:39.840> rotate<00:41:40.200> most<00:41:40.600> of<00:41:40.680> them" + }, + { + "start": 2500.87, + "duration": 0.0, + "text": "Like you don't rotate most of them" + }, + { + "start": 2500.88, + "duration": 0.0, + "text": "Like you don't rotate most of them because<00:41:41.200> a<00:41:41.240> lot<00:41:41.560> of<00:41:41.640> the<00:41:42.120> I<00:41:42.160> mean<00:41:42.320> the<00:41:42.440> argument" + }, + { + "start": 2502.91, + "duration": 0.0, + "text": "because a lot of the I mean the argument" + }, + { + "start": 2502.92, + "duration": 0.0, + "text": "because a lot of the I mean the argument originally<00:41:43.520> I<00:41:43.560> think<00:41:43.880> is<00:41:44.000> that<00:41:44.680> the<00:41:44.840> low" + }, + { + "start": 2505.11, + "duration": 0.0, + "text": "originally I think is that the low" + }, + { + "start": 2505.12, + "duration": 0.0, + "text": "originally I think is that the low frequency<00:41:45.720> parts<00:41:46.600> just<00:41:46.880> aren't<00:41:47.080> rotating" + }, + { + "start": 2507.47, + "duration": 0.0, + "text": "frequency parts just aren't rotating" + }, + { + "start": 2507.48, + "duration": 0.0, + "text": "frequency parts just aren't rotating very<00:41:47.680> much.<00:41:48.360> And<00:41:48.520> so<00:41:48.600> you<00:41:48.680> can<00:41:48.800> drop<00:41:49.240> them<00:41:49.560> if" + }, + { + "start": 2509.67, + "duration": 0.0, + "text": "very much. And so you can drop them if" + }, + { + "start": 2509.68, + "duration": 0.0, + "text": "very much. 
And so you can drop them if you're<00:41:49.840> really<00:41:50.320> strapped<00:41:50.960> for<00:41:51.480> you<00:41:51.600> know," + }, + { + "start": 2512.11, + "duration": 0.0, + "text": "you're really strapped for you know," + }, + { + "start": 2512.12, + "duration": 0.0, + "text": "you're really strapped for you know, sort<00:41:52.320> of<00:41:52.560> extra<00:41:52.920> space.<00:41:53.600> And<00:41:53.720> these<00:41:53.880> this<00:41:54.080> is" + }, + { + "start": 2514.19, + "duration": 0.0, + "text": "sort of extra space. And these this is" + }, + { + "start": 2514.2, + "duration": 0.0, + "text": "sort of extra space. And these this is really<00:41:54.520> a<00:41:54.800> optimization<00:41:55.320> for<00:41:55.440> teeny<00:41:55.720> tiny" + }, + { + "start": 2515.99, + "duration": 0.0, + "text": "really a optimization for teeny tiny" + }, + { + "start": 2516.0, + "duration": 0.0, + "text": "really a optimization for teeny tiny models<00:41:56.480> where<00:41:56.600> like<00:41:56.840> you<00:41:56.920> don't<00:41:57.120> have<00:41:57.280> very" + }, + { + "start": 2517.47, + "duration": 0.0, + "text": "models where like you don't have very" + }, + { + "start": 2517.48, + "duration": 0.0, + "text": "models where like you don't have very much<00:41:57.760> like<00:41:58.280> hidden<00:41:58.520> dimensions<00:41:59.160> to<00:41:59.440> to<00:41:59.520> have" + }, + { + "start": 2519.71, + "duration": 0.0, + "text": "much like hidden dimensions to to have" + }, + { + "start": 2519.72, + "duration": 0.0, + "text": "much like hidden dimensions to to have activations<00:42:00.240> for." + }, + { + "start": 2522.19, + "duration": 0.0, + "text": "activations for." + }, + { + "start": 2522.2, + "duration": 0.0, + "text": "activations for. Yeah." + }, + { + "start": 2523.55, + "duration": 0.0, + "text": "Yeah." + }, + { + "start": 2523.56, + "duration": 0.0, + "text": "Yeah. 
For<00:42:03.720> the<00:42:04.280> relative<00:42:04.880> embeddings<00:42:06.040> not<00:42:06.320> having" + }, + { + "start": 2526.59, + "duration": 0.0, + "text": "For the relative embeddings not having" + }, + { + "start": 2526.6, + "duration": 0.0, + "text": "For the relative embeddings not having an<00:42:06.760> inner<00:42:07.080> product," + }, + { + "start": 2528.31, + "duration": 0.0, + "text": "an inner product," + }, + { + "start": 2528.32, + "duration": 0.0, + "text": "an inner product, um<00:42:08.800> is<00:42:08.880> that<00:42:09.080> cuz<00:42:09.240> it<00:42:09.360> only<00:42:09.600> applies<00:42:10.040> to<00:42:10.360> keys" + }, + { + "start": 2530.51, + "duration": 0.0, + "text": "um is that cuz it only applies to keys" + }, + { + "start": 2530.52, + "duration": 0.0, + "text": "um is that cuz it only applies to keys specifically?<00:42:11.000> I'm<00:42:11.360> trying<00:42:11.560> to<00:42:11.640> understand" + }, + { + "start": 2532.11, + "duration": 0.0, + "text": "specifically? I'm trying to understand" + }, + { + "start": 2532.12, + "duration": 0.0, + "text": "specifically? I'm trying to understand the<00:42:12.200> logic.<00:42:12.600> Yeah,<00:42:12.800> so<00:42:12.920> they<00:42:13.040> applied<00:42:13.360> both" + }, + { + "start": 2533.63, + "duration": 0.0, + "text": "the logic. Yeah, so they applied both" + }, + { + "start": 2533.64, + "duration": 0.0, + "text": "the logic. 
Yeah, so they applied both the<00:42:13.960> the<00:42:14.120> keys<00:42:14.600> and<00:42:14.840> values,<00:42:15.400> which<00:42:15.560> is<00:42:15.680> kind" + }, + { + "start": 2535.83, + "duration": 0.0, + "text": "the the keys and values, which is kind" + }, + { + "start": 2535.84, + "duration": 0.0, + "text": "the the keys and values, which is kind of<00:42:15.960> why<00:42:16.600> you<00:42:16.680> know,<00:42:16.760> you<00:42:16.880> get<00:42:17.040> this<00:42:17.200> like" + }, + { + "start": 2537.39, + "duration": 0.0, + "text": "of why you know, you get this like" + }, + { + "start": 2537.4, + "duration": 0.0, + "text": "of why you know, you get this like relative<00:42:18.240> effect<00:42:18.840> from<00:42:19.000> where<00:42:19.200> you<00:42:19.360> are.<00:42:20.160> Um" + }, + { + "start": 2541.03, + "duration": 0.0, + "text": "relative effect from where you are. Um" + }, + { + "start": 2541.04, + "duration": 0.0, + "text": "relative effect from where you are. Um you<00:42:21.240> want<00:42:21.600> to<00:42:21.720> not<00:42:21.920> have<00:42:22.120> cross<00:42:22.520> terms,<00:42:22.960> right?" + }, + { + "start": 2543.11, + "duration": 0.0, + "text": "you want to not have cross terms, right?" + }, + { + "start": 2543.12, + "duration": 0.0, + "text": "you want to not have cross terms, right? 
So<00:42:23.240> so<00:42:23.360> if<00:42:23.480> you<00:42:23.600> look<00:42:23.760> at<00:42:23.880> the<00:42:23.960> sine<00:42:24.280> and<00:42:24.400> cosine" + }, + { + "start": 2544.75, + "duration": 0.0, + "text": "So so if you look at the sine and cosine" + }, + { + "start": 2544.76, + "duration": 0.0, + "text": "So so if you look at the sine and cosine embeddings,<00:42:25.640> then<00:42:25.800> you'll<00:42:26.240> not<00:42:26.520> only<00:42:26.760> get" + }, + { + "start": 2547.03, + "duration": 0.0, + "text": "embeddings, then you'll not only get" + }, + { + "start": 2547.04, + "duration": 0.0, + "text": "embeddings, then you'll not only get sort<00:42:27.200> of<00:42:27.280> the<00:42:27.920> you<00:42:28.000> know,<00:42:28.160> the<00:42:29.040> original" + }, + { + "start": 2549.55, + "duration": 0.0, + "text": "sort of the you know, the original" + }, + { + "start": 2549.56, + "duration": 0.0, + "text": "sort of the you know, the original vectors,<00:42:30.000> you'll<00:42:30.160> kind<00:42:30.320> of<00:42:30.400> get<00:42:30.560> these<00:42:30.800> weird" + }, + { + "start": 2551.31, + "duration": 0.0, + "text": "vectors, you'll kind of get these weird" + }, + { + "start": 2551.32, + "duration": 0.0, + "text": "vectors, you'll kind of get these weird cross<00:42:31.600> terms<00:42:31.800> between<00:42:32.040> the<00:42:32.120> position" + }, + { + "start": 2552.51, + "duration": 0.0, + "text": "cross terms between the position" + }, + { + "start": 2552.52, + "duration": 0.0, + "text": "cross terms between the position embeddings<00:42:33.440> and<00:42:33.640> the<00:42:34.320> word<00:42:34.600> embedding" + }, + { + "start": 2554.91, + "duration": 0.0, + "text": "embeddings and the word embedding" + }, + { + "start": 2554.92, + "duration": 0.0, + "text": "embeddings and the word embedding themselves<00:42:35.800> and<00:42:35.920> so<00:42:36.040> on<00:42:36.120> and<00:42:36.240> so<00:42:36.400> forth.<00:42:37.040> And" + }, 
+ { + "start": 2557.15, + "duration": 0.0, + "text": "themselves and so on and so forth. And" + }, + { + "start": 2557.16, + "duration": 0.0, + "text": "themselves and so on and so forth. And then<00:42:37.280> you<00:42:37.360> can<00:42:37.480> kind<00:42:37.600> of<00:42:37.720> back<00:42:38.040> out<00:42:38.160> what<00:42:38.280> the" + }, + { + "start": 2558.39, + "duration": 0.0, + "text": "then you can kind of back out what the" + }, + { + "start": 2558.4, + "duration": 0.0, + "text": "then you can kind of back out what the absolute<00:42:38.800> position<00:42:39.200> is.<00:42:39.800> So<00:42:39.960> even<00:42:40.200> sine<00:42:40.440> and" + }, + { + "start": 2560.55, + "duration": 0.0, + "text": "absolute position is. So even sine and" + }, + { + "start": 2560.56, + "duration": 0.0, + "text": "absolute position is. So even sine and cosine<00:42:40.920> embeddings<00:42:41.240> are<00:42:41.360> not<00:42:41.560> like<00:42:41.720> pure" + }, + { + "start": 2562.39, + "duration": 0.0, + "text": "cosine embeddings are not like pure" + }, + { + "start": 2562.4, + "duration": 0.0, + "text": "cosine embeddings are not like pure relative<00:42:42.800> position<00:42:43.120> embeddings.<00:42:44.080> Um" + }, + { + "start": 2564.99, + "duration": 0.0, + "text": "relative position embeddings. Um" + }, + { + "start": 2565.0, + "duration": 0.0, + "text": "relative position embeddings. 
Um you<00:42:45.320> know,<00:42:45.440> you<00:42:45.520> have<00:42:45.680> to<00:42:45.760> accept<00:42:46.040> the<00:42:46.160> premise" + }, + { + "start": 2566.55, + "duration": 0.0, + "text": "you know, you have to accept the premise" + }, + { + "start": 2566.56, + "duration": 0.0, + "text": "you know, you have to accept the premise that<00:42:46.880> you<00:42:46.960> know,<00:42:47.080> the<00:42:47.240> relative<00:42:47.800> embedding<00:42:48.120> is" + }, + { + "start": 2568.23, + "duration": 0.0, + "text": "that you know, the relative embedding is" + }, + { + "start": 2568.24, + "duration": 0.0, + "text": "that you know, the relative embedding is what<00:42:48.440> you<00:42:48.520> want.<00:42:49.280> But<00:42:49.440> once<00:42:49.640> you<00:42:49.760> do<00:42:49.960> kind<00:42:50.120> of" + }, + { + "start": 2570.23, + "duration": 0.0, + "text": "what you want. But once you do kind of" + }, + { + "start": 2570.24, + "duration": 0.0, + "text": "what you want. But once you do kind of you<00:42:50.360> end<00:42:50.600> up<00:42:50.760> at<00:42:50.880> the<00:42:51.000> RoPE<00:42:51.400> solution<00:42:51.920> somewhat" + }, + { + "start": 2572.23, + "duration": 0.0, + "text": "you end up at the RoPE solution somewhat" + }, + { + "start": 2572.24, + "duration": 0.0, + "text": "you end up at the RoPE solution somewhat naturally." + }, + { + "start": 2577.76, + "duration": 0.0, + "text": "Yeah." + }, + { + "start": 2579.51, + "duration": 0.0, + "text": "Yeah." + }, + { + "start": 2579.52, + "duration": 0.0, + "text": "Yeah. Yeah." + }, + { + "start": 2580.79, + "duration": 0.0, + "text": "Yeah." + }, + { + "start": 2580.8, + "duration": 0.0, + "text": "Yeah. 
So<00:43:01.080> what's<00:43:01.360> the<00:43:01.440> issue<00:43:01.600> with<00:43:01.920> So<00:43:02.080> the<00:43:02.200> issue" + }, + { + "start": 2582.47, + "duration": 0.0, + "text": "So what's the issue with So the issue" + }, + { + "start": 2582.48, + "duration": 0.0, + "text": "So what's the issue with So the issue with<00:43:02.600> this<00:43:02.840> is<00:43:02.960> that<00:43:03.080> it<00:43:03.200> just<00:43:03.440> can't<00:43:03.640> be" + }, + { + "start": 2583.75, + "duration": 0.0, + "text": "with this is that it just can't be" + }, + { + "start": 2583.76, + "duration": 0.0, + "text": "with this is that it just can't be factorized<00:43:04.400> as<00:43:04.520> an<00:43:04.640> inner<00:43:04.800> product.<00:43:05.120> That's" + }, + { + "start": 2585.35, + "duration": 0.0, + "text": "factorized as an inner product. That's" + }, + { + "start": 2585.36, + "duration": 0.0, + "text": "factorized as an inner product. That's more<00:43:05.480> of<00:43:05.600> an<00:43:05.680> aesthetic<00:43:06.080> problem,<00:43:06.520> right?" + }, + { + "start": 2586.63, + "duration": 0.0, + "text": "more of an aesthetic problem, right?" + }, + { + "start": 2586.64, + "duration": 0.0, + "text": "more of an aesthetic problem, right? 
Like<00:43:06.840> if<00:43:07.000> if<00:43:07.120> your<00:43:07.240> constraints<00:43:07.760> are<00:43:08.360> I<00:43:08.480> need" + }, + { + "start": 2588.79, + "duration": 0.0, + "text": "Like if if your constraints are I need" + }, + { + "start": 2588.8, + "duration": 0.0, + "text": "Like if if your constraints are I need it<00:43:08.920> to<00:43:09.040> be<00:43:09.840> uh<00:43:10.200> relative<00:43:10.760> and<00:43:10.880> I<00:43:10.920> need<00:43:11.120> it<00:43:11.240> to" + }, + { + "start": 2591.31, + "duration": 0.0, + "text": "it to be uh relative and I need it to" + }, + { + "start": 2591.32, + "duration": 0.0, + "text": "it to be uh relative and I need it to factorize<00:43:12.000> as<00:43:12.160> f<00:43:12.720> of<00:43:12.960> xi<00:43:13.360> and<00:43:13.480> f<00:43:13.600> of<00:43:13.720> yj,<00:43:14.680> then" + }, + { + "start": 2594.95, + "duration": 0.0, + "text": "factorize as f of xi and f of yj, then" + }, + { + "start": 2594.96, + "duration": 0.0, + "text": "factorize as f of xi and f of yj, then this<00:43:15.160> is<00:43:15.240> not<00:43:15.440> a<00:43:15.480> solution<00:43:15.960> in<00:43:16.040> that<00:43:16.240> class.<00:43:17.000> Um" + }, + { + "start": 2597.15, + "duration": 0.0, + "text": "this is not a solution in that class. Um" + }, + { + "start": 2597.16, + "duration": 0.0, + "text": "this is not a solution in that class. 
Um to<00:43:17.240> be<00:43:17.360> fair,<00:43:17.720> there's<00:43:17.920> a<00:43:17.960> lot<00:43:18.240> of<00:43:18.720> um" + }, + { + "start": 2599.07, + "duration": 0.0, + "text": "to be fair, there's a lot of um" + }, + { + "start": 2599.08, + "duration": 0.0, + "text": "to be fair, there's a lot of um embeddings<00:43:19.520> that<00:43:19.680> work<00:43:19.880> this<00:43:20.040> way<00:43:20.640> that<00:43:20.840> do" + }, + { + "start": 2601.03, + "duration": 0.0, + "text": "embeddings that work this way that do" + }, + { + "start": 2601.04, + "duration": 0.0, + "text": "embeddings that work this way that do work<00:43:21.280> like<00:43:21.480> Alibi<00:43:22.160> and<00:43:22.280> other<00:43:22.440> kinds<00:43:22.680> of<00:43:22.760> like" + }, + { + "start": 2603.23, + "duration": 0.0, + "text": "work like Alibi and other kinds of like" + }, + { + "start": 2603.24, + "duration": 0.0, + "text": "work like Alibi and other kinds of like approaches<00:43:23.840> like<00:43:24.000> do<00:43:24.240> do<00:43:24.560> this<00:43:24.680> kind<00:43:24.840> of" + }, + { + "start": 2605.27, + "duration": 0.0, + "text": "approaches like do do this kind of" + }, + { + "start": 2605.28, + "duration": 0.0, + "text": "approaches like do do this kind of inject<00:43:25.800> into<00:43:26.040> the<00:43:26.120> attention<00:43:26.560> matrix<00:43:27.040> and" + }, + { + "start": 2607.11, + "duration": 0.0, + "text": "inject into the attention matrix and" + }, + { + "start": 2607.12, + "duration": 0.0, + "text": "inject into the attention matrix and they<00:43:27.240> do<00:43:28.080> reasonably<00:43:28.600> well.<00:43:29.400> Um<00:43:30.120> it's<00:43:30.520> not" + }, + { + "start": 2610.91, + "duration": 0.0, + "text": "they do reasonably well. Um it's not" + }, + { + "start": 2610.92, + "duration": 0.0, + "text": "they do reasonably well. 
Um it's not necessarily<00:43:31.520> the<00:43:31.600> one<00:43:31.720> that's<00:43:31.880> become<00:43:32.120> the" + }, + { + "start": 2612.19, + "duration": 0.0, + "text": "necessarily the one that's become the" + }, + { + "start": 2612.2, + "duration": 0.0, + "text": "necessarily the one that's become the dominant<00:43:32.600> approach<00:43:32.920> is<00:43:33.000> what<00:43:33.120> I<00:43:33.160> can<00:43:33.320> say." + }, + { + "start": 2616.16, + "duration": 0.0, + "text": "Cool.<00:43:36.520> Okay." + }, + { + "start": 2617.95, + "duration": 0.0, + "text": "Cool. Okay." + }, + { + "start": 2617.96, + "duration": 0.0, + "text": "Cool. Okay. Great." + }, + { + "start": 2620.48, + "duration": 0.0, + "text": "Okay." + }, + { + "start": 2621.43, + "duration": 0.0, + "text": "Okay." + }, + { + "start": 2621.44, + "duration": 0.0, + "text": "Okay. Now<00:43:41.560> we'll<00:43:41.680> talk<00:43:41.800> about<00:43:42.000> hyperparameters<00:43:43.320> um" + }, + { + "start": 2623.63, + "duration": 0.0, + "text": "Now we'll talk about hyperparameters um" + }, + { + "start": 2623.64, + "duration": 0.0, + "text": "Now we'll talk about hyperparameters um and<00:43:43.760> I<00:43:43.800> think<00:43:44.000> hyperparameters<00:43:44.680> are<00:43:44.760> really" + }, + { + "start": 2624.95, + "duration": 0.0, + "text": "and I think hyperparameters are really" + }, + { + "start": 2624.96, + "duration": 0.0, + "text": "and I think hyperparameters are really something<00:43:45.280> that<00:43:45.400> you<00:43:45.520> start<00:43:45.800> to<00:43:45.880> engage<00:43:46.240> with" + }, + { + "start": 2626.47, + "duration": 0.0, + "text": "something that you start to engage with" + }, + { + "start": 2626.48, + "duration": 0.0, + "text": "something that you start to engage with once<00:43:46.640> you<00:43:46.760> like<00:43:47.200> actually<00:43:47.680> have<00:43:47.840> to<00:43:47.920> train<00:43:48.160> a" + }, + { + "start": 2628.19, + "duration": 0.0, + 
"text": "once you like actually have to train a" + }, + { + "start": 2628.2, + "duration": 0.0, + "text": "once you like actually have to train a model,<00:43:48.680> right?<00:43:48.920> When<00:43:49.040> your<00:43:49.200> knowledge<00:43:49.600> about" + }, + { + "start": 2630.35, + "duration": 0.0, + "text": "model, right? When your knowledge about" + }, + { + "start": 2630.36, + "duration": 0.0, + "text": "model, right? When your knowledge about language<00:43:50.720> models<00:43:51.040> are<00:43:51.160> abstract,<00:43:51.840> you<00:43:51.880> don't" + }, + { + "start": 2632.03, + "duration": 0.0, + "text": "language models are abstract, you don't" + }, + { + "start": 2632.04, + "duration": 0.0, + "text": "language models are abstract, you don't have<00:43:52.160> to<00:43:52.240> care<00:43:52.480> about<00:43:52.720> any<00:43:52.880> of<00:43:53.000> these.<00:43:53.240> But" + }, + { + "start": 2633.39, + "duration": 0.0, + "text": "have to care about any of these. But" + }, + { + "start": 2633.4, + "duration": 0.0, + "text": "have to care about any of these. But once<00:43:53.560> you<00:43:53.640> have<00:43:53.760> to<00:43:53.840> instantiate<00:43:54.520> it,<00:43:54.640> you" + }, + { + "start": 2634.71, + "duration": 0.0, + "text": "once you have to instantiate it, you" + }, + { + "start": 2634.72, + "duration": 0.0, + "text": "once you have to instantiate it, you start<00:43:54.960> to<00:43:55.040> ask<00:43:55.200> questions<00:43:55.600> like<00:43:56.440> well,<00:43:56.920> how" + }, + { + "start": 2637.11, + "duration": 0.0, + "text": "start to ask questions like well, how" + }, + { + "start": 2637.12, + "duration": 0.0, + "text": "start to ask questions like well, how big<00:43:57.359> should<00:43:57.520> the<00:43:57.640> feed<00:43:57.880> forward<00:43:58.240> size<00:43:58.600> be?" + }, + { + "start": 2639.47, + "duration": 0.0, + "text": "big should the feed forward size be?" 
+ }, + { + "start": 2639.48, + "duration": 0.0, + "text": "big should the feed forward size be? Um<00:43:59.720> how<00:43:59.840> many<00:44:00.080> heads<00:44:00.359> should<00:44:00.560> I<00:44:00.640> have?" + }, + { + "start": 2641.75, + "duration": 0.0, + "text": "Um how many heads should I have?" + }, + { + "start": 2641.76, + "duration": 0.0, + "text": "Um how many heads should I have? Um<00:44:01.840> what<00:44:02.000> should<00:44:02.120> my<00:44:02.280> vocab<00:44:02.800> size<00:44:03.200> be,<00:44:03.520> right?" + }, + { + "start": 2644.349, + "duration": 0.0, + "text": "Um what should my vocab size be, right?" + }, + { + "start": 2644.359, + "duration": 0.0, + "text": "Um what should my vocab size be, right? Um" + }, + { + "start": 2644.99, + "duration": 0.0, + "text": "Um" + }, + { + "start": 2645.0, + "duration": 0.0, + "text": "Um and<00:44:05.320> you<00:44:05.400> might<00:44:05.560> also<00:44:05.800> have<00:44:05.920> questions<00:44:06.320> of" + }, + { + "start": 2646.43, + "duration": 0.0, + "text": "and you might also have questions of" + }, + { + "start": 2646.44, + "duration": 0.0, + "text": "and you might also have questions of like<00:44:06.640> what<00:44:06.760> should<00:44:06.960> my<00:44:07.120> weight<00:44:07.359> decay<00:44:07.760> or" + }, + { + "start": 2647.87, + "duration": 0.0, + "text": "like what should my weight decay or" + }, + { + "start": 2647.88, + "duration": 0.0, + "text": "like what should my weight decay or dropout<00:44:08.480> be?<00:44:08.680> Like<00:44:08.840> do<00:44:08.960> I<00:44:09.040> even<00:44:09.240> need<00:44:09.359> to" + }, + { + "start": 2649.47, + "duration": 0.0, + "text": "dropout be? Like do I even need to" + }, + { + "start": 2649.48, + "duration": 0.0, + "text": "dropout be? 
Like do I even need to regularize?<00:44:09.960> I<00:44:10.040> I<00:44:10.080> have<00:44:10.320> a<00:44:10.359> lot<00:44:10.560> of<00:44:10.640> tokens," + }, + { + "start": 2651.23, + "duration": 0.0, + "text": "regularize? I I have a lot of tokens," + }, + { + "start": 2651.24, + "duration": 0.0, + "text": "regularize? I I have a lot of tokens, right?<00:44:11.560> So<00:44:11.680> do<00:44:11.800> I<00:44:11.880> need<00:44:12.160> regularization?" + }, + { + "start": 2653.59, + "duration": 0.0, + "text": "right? So do I need regularization?" + }, + { + "start": 2653.6, + "duration": 0.0, + "text": "right? So do I need regularization? Um<00:44:14.160> and<00:44:14.320> do<00:44:14.440> I<00:44:14.480> need<00:44:14.720> very<00:44:14.920> deep<00:44:15.200> models<00:44:15.520> or" + }, + { + "start": 2655.55, + "duration": 0.0, + "text": "Um and do I need very deep models or" + }, + { + "start": 2655.56, + "duration": 0.0, + "text": "Um and do I need very deep models or very<00:44:15.800> wide<00:44:16.160> models?<00:44:16.600> Like<00:44:16.720> what<00:44:16.840> are<00:44:16.920> the<00:44:17.240> the" + }, + { + "start": 2657.349, + "duration": 0.0, + "text": "very wide models? Like what are the the" + }, + { + "start": 2657.359, + "duration": 0.0, + "text": "very wide models? Like what are the the right<00:44:17.600> kinds<00:44:17.960> of<00:44:18.080> things<00:44:18.440> to<00:44:18.520> do<00:44:18.720> here,<00:44:19.000> right?" + }, + { + "start": 2659.75, + "duration": 0.0, + "text": "right kinds of things to do here, right?" + }, + { + "start": 2659.76, + "duration": 0.0, + "text": "right kinds of things to do here, right? 
Um<00:44:20.080> and<00:44:20.240> all<00:44:20.359> of<00:44:20.440> these<00:44:20.640> if<00:44:20.800> you<00:44:20.880> start<00:44:21.200> out" + }, + { + "start": 2661.349, + "duration": 0.0, + "text": "Um and all of these if you start out" + }, + { + "start": 2661.359, + "duration": 0.0, + "text": "Um and all of these if you start out with<00:44:21.520> no<00:44:21.720> knowledge,<00:44:22.160> it's<00:44:22.280> actually<00:44:22.520> very" + }, + { + "start": 2662.75, + "duration": 0.0, + "text": "with no knowledge, it's actually very" + }, + { + "start": 2662.76, + "duration": 0.0, + "text": "with no knowledge, it's actually very daunting<00:44:23.320> because<00:44:23.520> you<00:44:23.600> have<00:44:23.720> to<00:44:23.800> search<00:44:24.080> this" + }, + { + "start": 2664.23, + "duration": 0.0, + "text": "daunting because you have to search this" + }, + { + "start": 2664.24, + "duration": 0.0, + "text": "daunting because you have to search this like<00:44:24.440> very<00:44:24.760> big<00:44:25.000> high<00:44:25.160> dimensional<00:44:25.720> space." + }, + { + "start": 2666.75, + "duration": 0.0, + "text": "like very big high dimensional space." + }, + { + "start": 2666.76, + "duration": 0.0, + "text": "like very big high dimensional space. Um<00:44:27.200> but<00:44:27.359> the<00:44:27.440> space<00:44:27.800> of<00:44:27.880> things<00:44:28.120> that<00:44:28.280> people" + }, + { + "start": 2668.55, + "duration": 0.0, + "text": "Um but the space of things that people" + }, + { + "start": 2668.56, + "duration": 0.0, + "text": "Um but the space of things that people try<00:44:28.920> is<00:44:29.040> actually<00:44:29.400> pretty<00:44:29.720> small.<00:44:30.160> And<00:44:30.280> from" + }, + { + "start": 2670.43, + "duration": 0.0, + "text": "try is actually pretty small. And from" + }, + { + "start": 2670.44, + "duration": 0.0, + "text": "try is actually pretty small. 
And from that<00:44:30.680> maybe<00:44:30.920> you<00:44:31.000> can<00:44:31.120> start<00:44:31.359> to<00:44:31.440> think<00:44:31.600> about" + }, + { + "start": 2671.95, + "duration": 0.0, + "text": "that maybe you can start to think about" + }, + { + "start": 2671.96, + "duration": 0.0, + "text": "that maybe you can start to think about you<00:44:32.000> know,<00:44:32.120> smarter<00:44:32.640> search<00:44:33.000> processes<00:44:33.560> of" + }, + { + "start": 2673.67, + "duration": 0.0, + "text": "you know, smarter search processes of" + }, + { + "start": 2673.68, + "duration": 0.0, + "text": "you know, smarter search processes of like<00:44:33.880> where<00:44:34.160> you<00:44:34.280> want<00:44:34.480> to<00:44:34.520> vary<00:44:34.840> things." + }, + { + "start": 2675.99, + "duration": 0.0, + "text": "like where you want to vary things." + }, + { + "start": 2676.0, + "duration": 0.0, + "text": "like where you want to vary things. Um" + }, + { + "start": 2677.31, + "duration": 0.0, + "text": "Um" + }, + { + "start": 2677.32, + "duration": 0.0, + "text": "Um One<00:44:37.480> of<00:44:37.560> the<00:44:37.640> things<00:44:37.880> that's<00:44:38.080> a<00:44:38.120> really" + }, + { + "start": 2678.39, + "duration": 0.0, + "text": "One of the things that's a really" + }, + { + "start": 2678.4, + "duration": 0.0, + "text": "One of the things that's a really consensus<00:44:39.120> hyperparameter" + }, + { + "start": 2680.71, + "duration": 0.0, + "text": "consensus hyperparameter" + }, + { + "start": 2680.72, + "duration": 0.0, + "text": "consensus hyperparameter um<00:44:40.800> is<00:44:40.920> this<00:44:41.120> idea<00:44:41.720> of<00:44:41.960> the<00:44:42.720> uh" + }, + { + "start": 2683.27, + "duration": 0.0, + "text": "um is this idea of the uh" + }, + { + "start": 2683.28, + "duration": 0.0, + "text": "um is this idea of the uh ratio<00:44:43.880> between<00:44:44.160> the<00:44:44.280> feed<00:44:44.600> forward<00:44:44.880> size," + }, 
+ { + "start": 2685.23, + "duration": 0.0, + "text": "ratio between the feed forward size," + }, + { + "start": 2685.24, + "duration": 0.0, + "text": "ratio between the feed forward size, which<00:44:45.400> is<00:44:45.480> kind<00:44:45.640> of<00:44:45.720> the<00:44:45.920> output<00:44:46.280> of<00:44:46.359> your" + }, + { + "start": 2686.51, + "duration": 0.0, + "text": "which is kind of the output of your" + }, + { + "start": 2686.52, + "duration": 0.0, + "text": "which is kind of the output of your first<00:44:46.920> matrix<00:44:47.359> multiply<00:44:47.920> in<00:44:48.040> an<00:44:48.160> MLP,<00:44:49.080> and<00:44:49.200> the" + }, + { + "start": 2689.27, + "duration": 0.0, + "text": "first matrix multiply in an MLP, and the" + }, + { + "start": 2689.28, + "duration": 0.0, + "text": "first matrix multiply in an MLP, and the model<00:44:49.560> dimension,<00:44:50.320> right?<00:44:50.560> So<00:44:50.680> this<00:44:50.880> is" + }, + { + "start": 2690.99, + "duration": 0.0, + "text": "model dimension, right? So this is" + }, + { + "start": 2691.0, + "duration": 0.0, + "text": "model dimension, right? So this is really<00:44:51.280> the<00:44:51.880> the<00:44:52.240> uh<00:44:52.359> ratio<00:44:52.840> of<00:44:52.920> the<00:44:53.040> two" + }, + { + "start": 2693.23, + "duration": 0.0, + "text": "really the the uh ratio of the two" + }, + { + "start": 2693.24, + "duration": 0.0, + "text": "really the the uh ratio of the two dimensions<00:44:53.680> of<00:44:53.760> your<00:44:53.880> W1<00:44:54.680> and<00:44:54.800> your<00:44:55.000> as<00:44:55.160> well" + }, + { + "start": 2695.27, + "duration": 0.0, + "text": "dimensions of your W1 and your as well" + }, + { + "start": 2695.28, + "duration": 0.0, + "text": "dimensions of your W1 and your as well your<00:44:55.440> W2<00:44:56.480> matrix." + }, + { + "start": 2697.67, + "duration": 0.0, + "text": "your W2 matrix." 
+ }, + { + "start": 2697.68, + "duration": 0.0, + "text": "your W2 matrix. Um<00:44:57.920> this<00:44:58.120> seems<00:44:58.359> like<00:44:58.520> a<00:44:58.560> thing<00:44:58.720> that's<00:44:58.880> very" + }, + { + "start": 2699.11, + "duration": 0.0, + "text": "Um this seems like a thing that's very" + }, + { + "start": 2699.12, + "duration": 0.0, + "text": "Um this seems like a thing that's very important<00:44:59.560> and<00:44:59.640> controls<00:45:00.120> kind<00:45:00.280> of<00:45:00.359> the" + }, + { + "start": 2700.47, + "duration": 0.0, + "text": "important and controls kind of the" + }, + { + "start": 2700.48, + "duration": 0.0, + "text": "important and controls kind of the richness<00:45:01.400> of<00:45:01.560> your<00:45:01.720> MLPs.<00:45:02.680> So<00:45:02.880> what<00:45:03.080> should<00:45:03.240> it" + }, + { + "start": 2703.349, + "duration": 0.0, + "text": "richness of your MLPs. So what should it" + }, + { + "start": 2703.359, + "duration": 0.0, + "text": "richness of your MLPs. So what should it be?<00:45:04.040> Well,<00:45:04.400> for<00:45:04.840> whatever<00:45:05.280> reason,<00:45:05.680> it<00:45:05.840> should" + }, + { + "start": 2706.03, + "duration": 0.0, + "text": "be? Well, for whatever reason, it should" + }, + { + "start": 2706.04, + "duration": 0.0, + "text": "be? Well, for whatever reason, it should maybe<00:45:06.359> be<00:45:06.880> four<00:45:07.160> times<00:45:07.720> your<00:45:07.880> hidden" + }, + { + "start": 2708.11, + "duration": 0.0, + "text": "maybe be four times your hidden" + }, + { + "start": 2708.12, + "duration": 0.0, + "text": "maybe be four times your hidden dimension,<00:45:08.760> right?<00:45:09.520> Um<00:45:10.440> and<00:45:11.160> this<00:45:11.400> is<00:45:11.520> a<00:45:11.600> rule" + }, + { + "start": 2711.87, + "duration": 0.0, + "text": "dimension, right? Um and this is a rule" + }, + { + "start": 2711.88, + "duration": 0.0, + "text": "dimension, right? 
Um and this is a rule of<00:45:11.960> thumb<00:45:12.160> that<00:45:12.320> works<00:45:12.560> remarkably<00:45:13.560> well<00:45:14.040> and" + }, + { + "start": 2714.15, + "duration": 0.0, + "text": "of thumb that works remarkably well and" + }, + { + "start": 2714.16, + "duration": 0.0, + "text": "of thumb that works remarkably well and I<00:45:14.240> will<00:45:14.400> show<00:45:14.560> you<00:45:14.680> some<00:45:14.960> data<00:45:15.800> on<00:45:16.040> like<00:45:16.400> why" + }, + { + "start": 2716.67, + "duration": 0.0, + "text": "I will show you some data on like why" + }, + { + "start": 2716.68, + "duration": 0.0, + "text": "I will show you some data on like why maybe<00:45:17.000> this<00:45:17.160> is<00:45:17.280> a<00:45:17.320> fine<00:45:17.640> number<00:45:18.000> to<00:45:18.160> choose." + }, + { + "start": 2718.39, + "duration": 0.0, + "text": "maybe this is a fine number to choose." + }, + { + "start": 2718.4, + "duration": 0.0, + "text": "maybe this is a fine number to choose. There's<00:45:18.560> a<00:45:18.640> few<00:45:18.920> exceptions<00:45:20.160> um<00:45:20.400> and<00:45:20.520> funnily" + }, + { + "start": 2720.95, + "duration": 0.0, + "text": "There's a few exceptions um and funnily" + }, + { + "start": 2720.96, + "duration": 0.0, + "text": "There's a few exceptions um and funnily enough,<00:45:21.280> the<00:45:21.400> really<00:45:21.680> extreme<00:45:22.200> exceptions" + }, + { + "start": 2722.67, + "duration": 0.0, + "text": "enough, the really extreme exceptions" + }, + { + "start": 2722.68, + "duration": 0.0, + "text": "enough, the really extreme exceptions kind<00:45:22.800> of<00:45:22.920> backtrack<00:45:23.560> on<00:45:23.720> that." + }, + { + "start": 2725.03, + "duration": 0.0, + "text": "kind of backtrack on that." + }, + { + "start": 2725.04, + "duration": 0.0, + "text": "kind of backtrack on that. Okay." + }, + { + "start": 2726.03, + "duration": 0.0, + "text": "Okay." 
+ }, + { + "start": 2726.04, + "duration": 0.0, + "text": "Okay. Um<00:45:26.440> exception<00:45:27.000> number<00:45:27.359> one<00:45:28.200> is<00:45:28.560> variants<00:45:29.040> of" + }, + { + "start": 2729.15, + "duration": 0.0, + "text": "Um exception number one is variants of" + }, + { + "start": 2729.16, + "duration": 0.0, + "text": "Um exception number one is variants of the<00:45:29.280> gated<00:45:29.640> linear<00:45:29.960> unit.<00:45:30.320> I<00:45:30.400> already<00:45:30.680> told" + }, + { + "start": 2730.99, + "duration": 0.0, + "text": "the gated linear unit. I already told" + }, + { + "start": 2731.0, + "duration": 0.0, + "text": "the gated linear unit. I already told you<00:45:31.040> about<00:45:31.320> this.<00:45:31.520> So<00:45:31.600> if<00:45:31.680> you<00:45:31.760> were<00:45:31.880> thinking" + }, + { + "start": 2732.19, + "duration": 0.0, + "text": "you about this. So if you were thinking" + }, + { + "start": 2732.2, + "duration": 0.0, + "text": "you about this. So if you were thinking about<00:45:32.480> it,<00:45:32.880> this<00:45:32.960> is<00:45:33.080> probably<00:45:33.400> cached<00:45:33.800> in" + }, + { + "start": 2733.87, + "duration": 0.0, + "text": "about it, this is probably cached in" + }, + { + "start": 2733.88, + "duration": 0.0, + "text": "about it, this is probably cached in your<00:45:34.040> head,<00:45:34.359> right?<00:45:35.000> GLUs<00:45:35.760> have<00:45:36.080> more" + }, + { + "start": 2736.31, + "duration": 0.0, + "text": "your head, right? GLUs have more" + }, + { + "start": 2736.32, + "duration": 0.0, + "text": "your head, right? GLUs have more parameters,<00:45:36.960> right?<00:45:37.120> If<00:45:37.240> you<00:45:37.359> keep<00:45:37.520> the<00:45:37.640> same" + }, + { + "start": 2737.87, + "duration": 0.0, + "text": "parameters, right? If you keep the same" + }, + { + "start": 2737.88, + "duration": 0.0, + "text": "parameters, right? 
If you keep the same dimensions.<00:45:38.920> So<00:45:38.960> if<00:45:39.080> you<00:45:39.160> want<00:45:39.320> to<00:45:39.400> keep<00:45:39.680> the" + }, + { + "start": 2739.79, + "duration": 0.0, + "text": "dimensions. So if you want to keep the" + }, + { + "start": 2739.8, + "duration": 0.0, + "text": "dimensions. So if you want to keep the parameter<00:45:40.240> size<00:45:40.560> of<00:45:40.640> your<00:45:40.760> MLPs<00:45:41.240> the<00:45:41.359> same," + }, + { + "start": 2742.07, + "duration": 0.0, + "text": "parameter size of your MLPs the same," + }, + { + "start": 2742.08, + "duration": 0.0, + "text": "parameter size of your MLPs the same, well,<00:45:42.280> you<00:45:42.359> need<00:45:42.520> to<00:45:42.640> scale<00:45:43.000> down<00:45:43.359> by<00:45:43.520> 2/3," + }, + { + "start": 2744.31, + "duration": 0.0, + "text": "well, you need to scale down by 2/3," + }, + { + "start": 2744.32, + "duration": 0.0, + "text": "well, you need to scale down by 2/3, right?<00:45:44.920> So<00:45:45.400> most<00:45:45.800> GLU<00:45:46.240> variants,<00:45:47.120> this<00:45:47.320> means" + }, + { + "start": 2747.55, + "duration": 0.0, + "text": "right? So most GLU variants, this means" + }, + { + "start": 2747.56, + "duration": 0.0, + "text": "right? 
So most GLU variants, this means that<00:45:47.680> you're<00:45:47.800> going<00:45:47.920> to<00:45:48.000> end<00:45:48.240> up<00:45:48.359> with" + }, + { + "start": 2748.51, + "duration": 0.0, + "text": "that you're going to end up with" + }, + { + "start": 2748.52, + "duration": 0.0, + "text": "that you're going to end up with something<00:45:48.880> like<00:45:49.560> 2.67-ish," + }, + { + "start": 2751.39, + "duration": 0.0, + "text": "something like 2.67-ish," + }, + { + "start": 2751.4, + "duration": 0.0, + "text": "something like 2.67-ish, right?<00:45:51.560> So<00:45:51.680> everyone<00:45:52.520> that's<00:45:53.040> uh<00:45:53.400> down<00:45:53.680> here" + }, + { + "start": 2754.11, + "duration": 0.0, + "text": "right? So everyone that's uh down here" + }, + { + "start": 2754.12, + "duration": 0.0, + "text": "right? So everyone that's uh down here 2.67<00:45:55.040> to<00:45:55.200> 2.5,<00:45:56.320> this<00:45:56.440> is<00:45:56.680> roughly<00:45:57.280> applying" + }, + { + "start": 2757.87, + "duration": 0.0, + "text": "2.67 to 2.5, this is roughly applying" + }, + { + "start": 2757.88, + "duration": 0.0, + "text": "2.67 to 2.5, this is roughly applying this<00:45:58.120> like<00:45:58.400> 2/3<00:45:58.960> correction." + }, + { + "start": 2760.55, + "duration": 0.0, + "text": "this like 2/3 correction." + }, + { + "start": 2760.56, + "duration": 0.0, + "text": "this like 2/3 correction. 
Um<00:46:01.320> and<00:46:01.480> then<00:46:01.760> for<00:46:01.920> whatever<00:46:02.400> reason,<00:46:03.320> um<00:46:03.640> the" + }, + { + "start": 2763.79, + "duration": 0.0, + "text": "Um and then for whatever reason, um the" + }, + { + "start": 2763.8, + "duration": 0.0, + "text": "Um and then for whatever reason, um the Llama<00:46:04.200> 2<00:46:04.560> folks<00:46:05.400> decided,<00:46:06.160> well,<00:46:06.840> we<00:46:07.000> actually" + }, + { + "start": 2767.47, + "duration": 0.0, + "text": "Llama 2 folks decided, well, we actually" + }, + { + "start": 2767.48, + "duration": 0.0, + "text": "Llama 2 folks decided, well, we actually have<00:46:07.840> very<00:46:08.160> efficient<00:46:08.920> um<00:46:09.080> attention<00:46:09.640> heads" + }, + { + "start": 2770.03, + "duration": 0.0, + "text": "have very efficient um attention heads" + }, + { + "start": 2770.04, + "duration": 0.0, + "text": "have very efficient um attention heads with<00:46:10.240> like<00:46:11.000> um" + }, + { + "start": 2771.71, + "duration": 0.0, + "text": "with like um" + }, + { + "start": 2771.72, + "duration": 0.0, + "text": "with like um uh" + }, + { + "start": 2772.55, + "duration": 0.0, + "text": "uh" + }, + { + "start": 2772.56, + "duration": 0.0, + "text": "uh uh<00:46:13.080> MQ<00:46:13.480> A,<00:46:13.560> which<00:46:13.680> I'll<00:46:13.760> talk<00:46:13.920> about<00:46:14.160> later.<00:46:14.960> Um" + }, + { + "start": 2775.67, + "duration": 0.0, + "text": "uh MQ A, which I'll talk about later. Um" + }, + { + "start": 2775.68, + "duration": 0.0, + "text": "uh MQ A, which I'll talk about later. 
Um and<00:46:15.800> because<00:46:16.120> of<00:46:16.200> that,<00:46:16.880> we<00:46:17.040> can<00:46:17.200> multiply" + }, + { + "start": 2777.67, + "duration": 0.0, + "text": "and because of that, we can multiply" + }, + { + "start": 2777.68, + "duration": 0.0, + "text": "and because of that, we can multiply this<00:46:17.880> ratio<00:46:18.240> by<00:46:18.359> an<00:46:18.480> arbitrary<00:46:18.960> 1.33<00:46:20.080> and" + }, + { + "start": 2780.19, + "duration": 0.0, + "text": "this ratio by an arbitrary 1.33 and" + }, + { + "start": 2780.2, + "duration": 0.0, + "text": "this ratio by an arbitrary 1.33 and we'll<00:46:20.320> get<00:46:20.480> roughly<00:46:20.840> 3.5.<00:46:21.640> And<00:46:21.760> so<00:46:21.840> the<00:46:21.960> Llama" + }, + { + "start": 2782.23, + "duration": 0.0, + "text": "we'll get roughly 3.5. And so the Llama" + }, + { + "start": 2782.24, + "duration": 0.0, + "text": "we'll get roughly 3.5. And so the Llama people<00:46:22.560> kind<00:46:22.680> of<00:46:22.800> like<00:46:22.960> arbitrarily<00:46:23.520> chose<00:46:23.720> a" + }, + { + "start": 2783.75, + "duration": 0.0, + "text": "people kind of like arbitrarily chose a" + }, + { + "start": 2783.76, + "duration": 0.0, + "text": "people kind of like arbitrarily chose a slightly<00:46:24.080> different<00:46:24.440> ratio,<00:46:24.800> which" + }, + { + "start": 2785.31, + "duration": 0.0, + "text": "slightly different ratio, which" + }, + { + "start": 2785.32, + "duration": 0.0, + "text": "slightly different ratio, which essentially<00:46:25.800> emphasizes<00:46:26.480> the<00:46:26.600> MLPs<00:46:27.000> a<00:46:27.080> little" + }, + { + "start": 2787.31, + "duration": 0.0, + "text": "essentially emphasizes the MLPs a little" + }, + { + "start": 2787.32, + "duration": 0.0, + "text": "essentially emphasizes the MLPs a little bit<00:46:27.520> more.<00:46:28.280> Um<00:46:28.720> but<00:46:28.960> really<00:46:29.280> if<00:46:29.440> you<00:46:29.560> 
actually" + }, + { + "start": 2789.83, + "duration": 0.0, + "text": "bit more. Um but really if you actually" + }, + { + "start": 2789.84, + "duration": 0.0, + "text": "bit more. Um but really if you actually look<00:46:30.040> through<00:46:30.280> all<00:46:30.440> of<00:46:30.560> the<00:46:30.920> the<00:46:31.000> papers," + }, + { + "start": 2791.39, + "duration": 0.0, + "text": "look through all of the the papers," + }, + { + "start": 2791.4, + "duration": 0.0, + "text": "look through all of the the papers, you'll<00:46:31.520> find<00:46:32.080> you<00:46:32.120> know,<00:46:32.320> either<00:46:32.680> 2.6-ish<00:46:33.680> or" + }, + { + "start": 2793.79, + "duration": 0.0, + "text": "you'll find you know, either 2.6-ish or" + }, + { + "start": 2793.8, + "duration": 0.0, + "text": "you'll find you know, either 2.6-ish or 3.5<00:46:34.560> for<00:46:34.720> GLUs<00:46:35.720> um<00:46:36.120> or<00:46:36.320> four<00:46:37.000> if<00:46:37.160> you're<00:46:37.280> doing" + }, + { + "start": 2797.91, + "duration": 0.0, + "text": "3.5 for GLUs um or four if you're doing" + }, + { + "start": 2797.92, + "duration": 0.0, + "text": "3.5 for GLUs um or four if you're doing uh<00:46:38.000> non-GLU<00:46:39.480> models." + }, + { + "start": 2801.75, + "duration": 0.0, + "text": "uh non-GLU models." + }, + { + "start": 2801.76, + "duration": 0.0, + "text": "uh non-GLU models. Okay.<00:46:42.320> There's<00:46:42.520> another<00:46:42.840> exception,<00:46:43.280> which<00:46:43.400> I" + }, + { + "start": 2803.47, + "duration": 0.0, + "text": "Okay. There's another exception, which I" + }, + { + "start": 2803.48, + "duration": 0.0, + "text": "Okay. 
There's another exception, which I find<00:46:43.720> to<00:46:43.760> be<00:46:43.880> very<00:46:44.120> funny<00:46:44.400> but<00:46:44.560> also<00:46:44.880> very<00:46:45.160> very" + }, + { + "start": 2805.39, + "duration": 0.0, + "text": "find to be very funny but also very very" + }, + { + "start": 2805.4, + "duration": 0.0, + "text": "find to be very funny but also very very cool,<00:46:46.240> which<00:46:46.440> is<00:46:46.880> um" + }, + { + "start": 2807.51, + "duration": 0.0, + "text": "cool, which is um" + }, + { + "start": 2807.52, + "duration": 0.0, + "text": "cool, which is um you<00:46:47.600> know,<00:46:47.840> throughout<00:46:48.520> as<00:46:48.640> you<00:46:48.800> read<00:46:49.080> these" + }, + { + "start": 2809.23, + "duration": 0.0, + "text": "you know, throughout as you read these" + }, + { + "start": 2809.24, + "duration": 0.0, + "text": "you know, throughout as you read these like<00:46:49.359> technical<00:46:49.880> reports,<00:46:50.600> you'll<00:46:50.760> find<00:46:51.120> that" + }, + { + "start": 2811.47, + "duration": 0.0, + "text": "like technical reports, you'll find that" + }, + { + "start": 2811.48, + "duration": 0.0, + "text": "like technical reports, you'll find that most<00:46:51.800> people<00:46:52.120> are<00:46:52.240> just<00:46:52.520> very<00:46:52.880> boring<00:46:53.440> in" + }, + { + "start": 2813.51, + "duration": 0.0, + "text": "most people are just very boring in" + }, + { + "start": 2813.52, + "duration": 0.0, + "text": "most people are just very boring in their<00:46:53.680> choice<00:46:53.960> of<00:46:54.080> architectures.<00:46:54.680> They're" + }, + { + "start": 2814.79, + "duration": 0.0, + "text": "their choice of architectures. They're" + }, + { + "start": 2814.8, + "duration": 0.0, + "text": "their choice of architectures. 
They're like<00:46:55.000> we<00:46:55.120> did<00:46:55.320> Llama<00:46:55.760> but<00:46:55.880> we<00:46:55.960> changed<00:46:56.320> one" + }, + { + "start": 2816.51, + "duration": 0.0, + "text": "like we did Llama but we changed one" + }, + { + "start": 2816.52, + "duration": 0.0, + "text": "like we did Llama but we changed one thing.<00:46:57.240> Um<00:46:57.880> but<00:46:58.640> you<00:46:58.720> know,<00:46:58.840> folks<00:46:59.120> at<00:46:59.280> Google" + }, + { + "start": 2819.51, + "duration": 0.0, + "text": "thing. Um but you know, folks at Google" + }, + { + "start": 2819.52, + "duration": 0.0, + "text": "thing. Um but you know, folks at Google are<00:46:59.600> very<00:46:59.840> bold<00:47:00.200> sometimes<00:47:01.240> um<00:47:01.760> and<00:47:01.920> T5<00:47:02.480> is<00:47:02.560> one" + }, + { + "start": 2822.67, + "duration": 0.0, + "text": "are very bold sometimes um and T5 is one" + }, + { + "start": 2822.68, + "duration": 0.0, + "text": "are very bold sometimes um and T5 is one of<00:47:02.760> my<00:47:02.840> favorite<00:47:03.280> ones<00:47:03.880> because<00:47:04.120> they<00:47:04.200> have" + }, + { + "start": 2824.31, + "duration": 0.0, + "text": "of my favorite ones because they have" + }, + { + "start": 2824.32, + "duration": 0.0, + "text": "of my favorite ones because they have some<00:47:04.560> really<00:47:04.920> bold<00:47:05.240> settings.<00:47:06.240> Uh<00:47:06.600> they" + }, + { + "start": 2826.79, + "duration": 0.0, + "text": "some really bold settings. Uh they" + }, + { + "start": 2826.8, + "duration": 0.0, + "text": "some really bold settings. 
Uh they decided<00:47:07.400> that<00:47:08.000> um<00:47:08.120> instead<00:47:08.359> of<00:47:08.440> following" + }, + { + "start": 2828.87, + "duration": 0.0, + "text": "decided that um instead of following" + }, + { + "start": 2828.88, + "duration": 0.0, + "text": "decided that um instead of following this<00:47:09.040> like<00:47:09.280> 4x<00:47:09.880> rule<00:47:10.080> of<00:47:10.200> thumb,<00:47:10.840> they<00:47:10.960> decided" + }, + { + "start": 2831.47, + "duration": 0.0, + "text": "this like 4x rule of thumb, they decided" + }, + { + "start": 2831.48, + "duration": 0.0, + "text": "this like 4x rule of thumb, they decided that<00:47:11.600> they<00:47:11.760> want<00:47:11.920> to<00:47:11.960> have<00:47:12.080> a<00:47:12.120> 64x" + }, + { + "start": 2833.83, + "duration": 0.0, + "text": "that they want to have a 64x" + }, + { + "start": 2833.84, + "duration": 0.0, + "text": "that they want to have a 64x multiplier,<00:47:14.480> which<00:47:14.640> is<00:47:14.760> like<00:47:15.000> way<00:47:15.240> bigger" + }, + { + "start": 2835.83, + "duration": 0.0, + "text": "multiplier, which is like way bigger" + }, + { + "start": 2835.84, + "duration": 0.0, + "text": "multiplier, which is like way bigger than<00:47:16.080> four.<00:47:17.000> Um<00:47:17.520> and<00:47:17.640> they<00:47:17.760> have<00:47:17.920> a<00:47:18.000> reasonable" + }, + { + "start": 2838.51, + "duration": 0.0, + "text": "than four. Um and they have a reasonable" + }, + { + "start": 2838.52, + "duration": 0.0, + "text": "than four. Um and they have a reasonable argument<00:47:18.840> for<00:47:18.960> this<00:47:19.080> as<00:47:19.200> well.<00:47:19.359> This<00:47:19.480> is" + }, + { + "start": 2839.55, + "duration": 0.0, + "text": "argument for this as well. This is" + }, + { + "start": 2839.56, + "duration": 0.0, + "text": "argument for this as well. 
This is another<00:47:19.960> like<00:47:20.120> systems-based<00:47:20.920> argument," + }, + { + "start": 2841.39, + "duration": 0.0, + "text": "another like systems-based argument," + }, + { + "start": 2841.4, + "duration": 0.0, + "text": "another like systems-based argument, right?<00:47:21.640> They<00:47:21.760> said,<00:47:22.120> well,<00:47:22.840> you<00:47:22.960> know,<00:47:23.160> if<00:47:23.400> the" + }, + { + "start": 2843.51, + "duration": 0.0, + "text": "right? They said, well, you know, if the" + }, + { + "start": 2843.52, + "duration": 0.0, + "text": "right? They said, well, you know, if the bigger<00:47:23.840> my<00:47:24.000> matrix<00:47:24.359> multiplies,<00:47:24.880> the<00:47:25.000> more" + }, + { + "start": 2845.19, + "duration": 0.0, + "text": "bigger my matrix multiplies, the more" + }, + { + "start": 2845.2, + "duration": 0.0, + "text": "bigger my matrix multiplies, the more efficient<00:47:26.000> I<00:47:26.080> can<00:47:26.240> keep<00:47:26.440> my<00:47:26.560> hardware.<00:47:27.440> So<00:47:27.640> if" + }, + { + "start": 2847.79, + "duration": 0.0, + "text": "efficient I can keep my hardware. So if" + }, + { + "start": 2847.8, + "duration": 0.0, + "text": "efficient I can keep my hardware. 
So if I<00:47:27.880> make<00:47:28.240> this,<00:47:28.880> you<00:47:28.960> know,<00:47:29.080> multiplier<00:47:29.680> really" + }, + { + "start": 2849.95, + "duration": 0.0, + "text": "I make this, you know, multiplier really" + }, + { + "start": 2849.96, + "duration": 0.0, + "text": "I make this, you know, multiplier really big,<00:47:30.880> then<00:47:31.560> you<00:47:31.640> know,<00:47:31.760> my<00:47:31.920> matrix<00:47:32.320> multiplies" + }, + { + "start": 2853.27, + "duration": 0.0, + "text": "big, then you know, my matrix multiplies" + }, + { + "start": 2853.28, + "duration": 0.0, + "text": "big, then you know, my matrix multiplies can<00:47:33.680> potentially<00:47:34.720> be<00:47:34.960> sort<00:47:35.120> of<00:47:35.280> more" + }, + { + "start": 2855.59, + "duration": 0.0, + "text": "can potentially be sort of more" + }, + { + "start": 2855.6, + "duration": 0.0, + "text": "can potentially be sort of more efficiently<00:47:36.120> utilized,<00:47:36.760> right?" + }, + { + "start": 2857.95, + "duration": 0.0, + "text": "efficiently utilized, right?" + }, + { + "start": 2857.96, + "duration": 0.0, + "text": "efficiently utilized, right? 
Um<00:47:38.880> and<00:47:39.359> some<00:47:39.600> others<00:47:39.960> like<00:47:40.160> Gemma<00:47:40.440> 2<00:47:40.720> have" + }, + { + "start": 2860.87, + "duration": 0.0, + "text": "Um and some others like Gemma 2 have" + }, + { + "start": 2860.88, + "duration": 0.0, + "text": "Um and some others like Gemma 2 have also<00:47:41.160> tried<00:47:41.400> to<00:47:41.480> really<00:47:42.120> push<00:47:42.359> a<00:47:42.400> little<00:47:42.640> bit" + }, + { + "start": 2862.79, + "duration": 0.0, + "text": "also tried to really push a little bit" + }, + { + "start": 2862.8, + "duration": 0.0, + "text": "also tried to really push a little bit higher<00:47:43.600> on<00:47:43.800> this.<00:47:44.520> But<00:47:44.680> really<00:47:45.080> uh<00:47:45.240> T5<00:47:45.840> is<00:47:45.960> an" + }, + { + "start": 2866.11, + "duration": 0.0, + "text": "higher on this. But really uh T5 is an" + }, + { + "start": 2866.12, + "duration": 0.0, + "text": "higher on this. But really uh T5 is an kind<00:47:46.320> of<00:47:46.400> astounding<00:47:47.000> exception<00:47:47.520> at<00:47:47.680> 64.<00:47:48.280> I" + }, + { + "start": 2868.31, + "duration": 0.0, + "text": "kind of astounding exception at 64. I" + }, + { + "start": 2868.32, + "duration": 0.0, + "text": "kind of astounding exception at 64. I don't<00:47:48.480> think<00:47:48.920> any<00:47:49.200> other<00:47:49.400> model<00:47:49.680> has<00:47:49.840> really" + }, + { + "start": 2870.07, + "duration": 0.0, + "text": "don't think any other model has really" + }, + { + "start": 2870.08, + "duration": 0.0, + "text": "don't think any other model has really gone<00:47:50.320> that<00:47:50.520> high<00:47:51.120> in<00:47:51.240> the<00:47:51.359> feed<00:47:51.600> forward" + }, + { + "start": 2871.83, + "duration": 0.0, + "text": "gone that high in the feed forward" + }, + { + "start": 2871.84, + "duration": 0.0, + "text": "gone that high in the feed forward multiplier." 
+ }, + { + "start": 2873.19, + "duration": 0.0, + "text": "multiplier." + }, + { + "start": 2873.2, + "duration": 0.0, + "text": "multiplier. Um" + }, + { + "start": 2874.87, + "duration": 0.0, + "text": "Um" + }, + { + "start": 2874.88, + "duration": 0.0, + "text": "Um and<00:47:55.800> empirically,<00:47:56.600> if<00:47:56.760> you<00:47:56.880> look<00:47:57.080> at<00:47:57.320> other" + }, + { + "start": 2877.59, + "duration": 0.0, + "text": "and empirically, if you look at other" + }, + { + "start": 2877.6, + "duration": 0.0, + "text": "and empirically, if you look at other sort<00:47:57.800> of<00:47:57.920> works<00:47:58.359> that<00:47:58.560> try<00:47:58.680> to<00:47:58.800> do<00:47:59.000> more" + }, + { + "start": 2879.19, + "duration": 0.0, + "text": "sort of works that try to do more" + }, + { + "start": 2879.2, + "duration": 0.0, + "text": "sort of works that try to do more controlled<00:47:59.760> comparisons<00:48:00.400> of<00:48:00.520> this<00:48:00.720> ratio,<00:48:01.480> um" + }, + { + "start": 2881.55, + "duration": 0.0, + "text": "controlled comparisons of this ratio, um" + }, + { + "start": 2881.56, + "duration": 0.0, + "text": "controlled comparisons of this ratio, um I've<00:48:01.720> taken<00:48:02.000> this<00:48:02.160> one<00:48:02.280> from<00:48:02.440> Kaplan<00:48:02.840> in<00:48:02.960> 2020." + }, + { + "start": 2883.47, + "duration": 0.0, + "text": "I've taken this one from Kaplan in 2020." + }, + { + "start": 2883.48, + "duration": 0.0, + "text": "I've taken this one from Kaplan in 2020. 
This<00:48:03.640> is<00:48:03.760> the<00:48:04.320> classic<00:48:05.240> uh<00:48:05.359> neural<00:48:05.680> scaling" + }, + { + "start": 2886.11, + "duration": 0.0, + "text": "This is the classic uh neural scaling" + }, + { + "start": 2886.12, + "duration": 0.0, + "text": "This is the classic uh neural scaling laws<00:48:06.359> paper<00:48:07.320> um<00:48:08.120> where<00:48:08.280> they<00:48:08.560> they<00:48:08.680> do<00:48:08.840> sort<00:48:09.040> of" + }, + { + "start": 2889.11, + "duration": 0.0, + "text": "laws paper um where they they do sort of" + }, + { + "start": 2889.12, + "duration": 0.0, + "text": "laws paper um where they they do sort of various<00:48:09.560> controlled<00:48:10.520> uh<00:48:10.720> studies<00:48:11.160> on" + }, + { + "start": 2891.27, + "duration": 0.0, + "text": "various controlled uh studies on" + }, + { + "start": 2891.28, + "duration": 0.0, + "text": "various controlled uh studies on language<00:48:11.600> models.<00:48:12.400> You'll<00:48:12.560> see,<00:48:12.960> you<00:48:13.040> know," + }, + { + "start": 2893.11, + "duration": 0.0, + "text": "language models. You'll see, you know," + }, + { + "start": 2893.12, + "duration": 0.0, + "text": "language models. You'll see, you know, this<00:48:13.280> wasn't<00:48:13.520> the<00:48:13.600> point<00:48:13.880> of<00:48:13.960> the<00:48:14.040> study," + }, + { + "start": 2894.55, + "duration": 0.0, + "text": "this wasn't the point of the study," + }, + { + "start": 2894.56, + "duration": 0.0, + "text": "this wasn't the point of the study, right?<00:48:14.880> It<00:48:14.960> was<00:48:15.080> a<00:48:15.120> scaling<00:48:15.520> laws<00:48:15.720> study.<00:48:15.880> But" + }, + { + "start": 2895.99, + "duration": 0.0, + "text": "right? It was a scaling laws study. But" + }, + { + "start": 2896.0, + "duration": 0.0, + "text": "right? It was a scaling laws study. 
But you'll<00:48:16.120> see<00:48:16.400> in<00:48:16.560> one<00:48:16.680> of<00:48:16.760> the<00:48:16.840> panels<00:48:17.560> that" + }, + { + "start": 2897.71, + "duration": 0.0, + "text": "you'll see in one of the panels that" + }, + { + "start": 2897.72, + "duration": 0.0, + "text": "you'll see in one of the panels that they<00:48:17.840> actually<00:48:18.520> have<00:48:18.800> a<00:48:19.160> sort<00:48:19.320> of<00:48:19.400> ablation<00:48:19.880> or" + }, + { + "start": 2899.99, + "duration": 0.0, + "text": "they actually have a sort of ablation or" + }, + { + "start": 2900.0, + "duration": 0.0, + "text": "they actually have a sort of ablation or sweep<00:48:20.560> where<00:48:20.680> they<00:48:20.800> change<00:48:21.160> the<00:48:21.280> feed<00:48:21.560> forward" + }, + { + "start": 2901.83, + "duration": 0.0, + "text": "sweep where they change the feed forward" + }, + { + "start": 2901.84, + "duration": 0.0, + "text": "sweep where they change the feed forward ratio<00:48:22.320> and<00:48:22.400> they<00:48:22.480> look<00:48:22.640> at<00:48:22.720> the<00:48:22.840> loss,<00:48:23.359> right?" + }, + { + "start": 2904.11, + "duration": 0.0, + "text": "ratio and they look at the loss, right?" + }, + { + "start": 2904.12, + "duration": 0.0, + "text": "ratio and they look at the loss, right? Um<00:48:24.359> for<00:48:24.480> a<00:48:24.520> very<00:48:24.720> small<00:48:24.960> model<00:48:25.200> here,<00:48:25.359> right?" + }, + { + "start": 2906.43, + "duration": 0.0, + "text": "Um for a very small model here, right?" + }, + { + "start": 2906.44, + "duration": 0.0, + "text": "Um for a very small model here, right? 
But<00:48:26.560> what<00:48:26.720> they<00:48:26.920> what<00:48:27.120> they<00:48:27.240> find<00:48:27.720> in<00:48:27.840> this" + }, + { + "start": 2907.99, + "duration": 0.0, + "text": "But what they what they find in this" + }, + { + "start": 2908.0, + "duration": 0.0, + "text": "But what they what they find in this paper<00:48:28.800> is<00:48:29.440> there's<00:48:29.600> a<00:48:29.680> basin<00:48:30.440> where<00:48:30.600> you<00:48:30.720> start" + }, + { + "start": 2910.95, + "duration": 0.0, + "text": "paper is there's a basin where you start" + }, + { + "start": 2910.96, + "duration": 0.0, + "text": "paper is there's a basin where you start at<00:48:31.040> about<00:48:31.400> one<00:48:31.760> and<00:48:31.840> you<00:48:31.920> end<00:48:32.080> up<00:48:32.160> about<00:48:32.400> maybe" + }, + { + "start": 2912.59, + "duration": 0.0, + "text": "at about one and you end up about maybe" + }, + { + "start": 2912.6, + "duration": 0.0, + "text": "at about one and you end up about maybe 10<00:48:33.480> where<00:48:33.680> this<00:48:33.840> hyperparameter<00:48:34.520> is<00:48:34.600> like" + }, + { + "start": 2914.79, + "duration": 0.0, + "text": "10 where this hyperparameter is like" + }, + { + "start": 2914.8, + "duration": 0.0, + "text": "10 where this hyperparameter is like pretty<00:48:35.200> good<00:48:35.480> and<00:48:35.640> very<00:48:35.960> very<00:48:36.240> flat.<00:48:36.640> You<00:48:36.760> lose" + }, + { + "start": 2917.03, + "duration": 0.0, + "text": "pretty good and very very flat. You lose" + }, + { + "start": 2917.04, + "duration": 0.0, + "text": "pretty good and very very flat. 
You lose very<00:48:37.520> little<00:48:38.160> relative<00:48:38.600> to<00:48:38.680> the<00:48:38.840> optimal<00:48:39.359> loss" + }, + { + "start": 2919.95, + "duration": 0.0, + "text": "very little relative to the optimal loss" + }, + { + "start": 2919.96, + "duration": 0.0, + "text": "very little relative to the optimal loss down<00:48:40.200> here,<00:48:40.600> right?<00:48:41.400> Um<00:48:41.680> and<00:48:41.800> then<00:48:41.920> if<00:48:42.040> you<00:48:42.160> get" + }, + { + "start": 2922.31, + "duration": 0.0, + "text": "down here, right? Um and then if you get" + }, + { + "start": 2922.32, + "duration": 0.0, + "text": "down here, right? Um and then if you get it<00:48:42.440> really<00:48:42.800> wrong,<00:48:43.120> like<00:48:43.280> you<00:48:43.359> get,<00:48:43.640> you<00:48:43.720> know," + }, + { + "start": 2923.83, + "duration": 0.0, + "text": "it really wrong, like you get, you know," + }, + { + "start": 2923.84, + "duration": 0.0, + "text": "it really wrong, like you get, you know, above<00:48:44.120> 10<00:48:44.400> to<00:48:44.520> 100<00:48:45.000> or<00:48:45.080> something<00:48:45.400> like<00:48:45.560> that," + }, + { + "start": 2926.19, + "duration": 0.0, + "text": "above 10 to 100 or something like that," + }, + { + "start": 2926.2, + "duration": 0.0, + "text": "above 10 to 100 or something like that, you<00:48:46.320> know,<00:48:46.400> then<00:48:46.600> your<00:48:46.720> loss<00:48:46.960> starts<00:48:47.160> really" + }, + { + "start": 2927.349, + "duration": 0.0, + "text": "you know, then your loss starts really" + }, + { + "start": 2927.359, + "duration": 0.0, + "text": "you know, then your loss starts really shooting<00:48:47.720> up<00:48:47.840> quadratically." + }, + { + "start": 2929.15, + "duration": 0.0, + "text": "shooting up quadratically." + }, + { + "start": 2929.16, + "duration": 0.0, + "text": "shooting up quadratically. 
Um<00:48:49.560> and<00:48:49.680> so<00:48:49.800> a<00:48:49.840> lot<00:48:50.080> of<00:48:50.160> these<00:48:50.359> choices<00:48:50.880> that" + }, + { + "start": 2931.11, + "duration": 0.0, + "text": "Um and so a lot of these choices that" + }, + { + "start": 2931.12, + "duration": 0.0, + "text": "Um and so a lot of these choices that range<00:48:51.400> between<00:48:51.760> like<00:48:52.000> 2.6<00:48:52.720> to<00:48:52.840> four,<00:48:53.440> they're" + }, + { + "start": 2933.59, + "duration": 0.0, + "text": "range between like 2.6 to four, they're" + }, + { + "start": 2933.6, + "duration": 0.0, + "text": "range between like 2.6 to four, they're all<00:48:53.800> kind<00:48:53.960> of<00:48:54.080> falling<00:48:54.560> into<00:48:54.760> this<00:48:55.200> relatively" + }, + { + "start": 2935.71, + "duration": 0.0, + "text": "all kind of falling into this relatively" + }, + { + "start": 2935.72, + "duration": 0.0, + "text": "all kind of falling into this relatively nice<00:48:55.960> basin.<00:48:56.680> So<00:48:56.800> you're<00:48:56.960> fine<00:48:57.320> choosing" + }, + { + "start": 2937.67, + "duration": 0.0, + "text": "nice basin. So you're fine choosing" + }, + { + "start": 2937.68, + "duration": 0.0, + "text": "nice basin. So you're fine choosing those<00:48:57.880> numbers,<00:48:58.520> right?" + }, + { + "start": 2939.87, + "duration": 0.0, + "text": "those numbers, right?" + }, + { + "start": 2939.88, + "duration": 0.0, + "text": "those numbers, right? Okay.<00:49:00.680> So<00:49:00.960> what<00:49:01.160> can<00:49:01.280> we<00:49:01.400> learn<00:49:01.600> about<00:49:01.840> this" + }, + { + "start": 2942.03, + "duration": 0.0, + "text": "Okay. So what can we learn about this" + }, + { + "start": 2942.04, + "duration": 0.0, + "text": "Okay. So what can we learn about this hyperparameter?<00:49:03.120> Well,<00:49:03.280> the<00:49:03.359> default" + }, + { + "start": 2943.75, + "duration": 0.0, + "text": "hyperparameter? 
Well, the default" + }, + { + "start": 2943.76, + "duration": 0.0, + "text": "hyperparameter? Well, the default choices<00:49:04.120> have<00:49:04.280> worked<00:49:04.600> very<00:49:04.960> well<00:49:05.240> for<00:49:05.400> nearly" + }, + { + "start": 2945.83, + "duration": 0.0, + "text": "choices have worked very well for nearly" + }, + { + "start": 2945.84, + "duration": 0.0, + "text": "choices have worked very well for nearly all<00:49:06.080> modern<00:49:06.480> language<00:49:06.840> models.<00:49:07.120> So<00:49:07.200> you<00:49:07.280> can" + }, + { + "start": 2947.43, + "duration": 0.0, + "text": "all modern language models. So you can" + }, + { + "start": 2947.44, + "duration": 0.0, + "text": "all modern language models. So you can safely<00:49:07.840> choose<00:49:08.160> that.<00:49:08.880> Um<00:49:09.280> T5<00:49:09.800> was<00:49:09.920> a<00:49:09.960> fine" + }, + { + "start": 2950.27, + "duration": 0.0, + "text": "safely choose that. Um T5 was a fine" + }, + { + "start": 2950.28, + "duration": 0.0, + "text": "safely choose that. Um T5 was a fine model<00:49:10.680> or<00:49:10.760> the<00:49:10.880> Virgin<00:49:11.240> 1<00:49:11.400> T5<00:49:11.880> was<00:49:12.000> a<00:49:12.040> fine" + }, + { + "start": 2952.31, + "duration": 0.0, + "text": "model or the Virgin 1 T5 was a fine" + }, + { + "start": 2952.32, + "duration": 0.0, + "text": "model or the Virgin 1 T5 was a fine model,<00:49:12.680> right?<00:49:12.800> Like<00:49:12.960> it<00:49:13.040> wasn't<00:49:13.200> a<00:49:13.240> bad" + }, + { + "start": 2953.55, + "duration": 0.0, + "text": "model, right? Like it wasn't a bad" + }, + { + "start": 2953.56, + "duration": 0.0, + "text": "model, right? Like it wasn't a bad model.<00:49:14.440> Um<00:49:14.520> and<00:49:14.640> so<00:49:14.760> even<00:49:15.040> radical<00:49:15.440> choices" + }, + { + "start": 2955.79, + "duration": 0.0, + "text": "model. 
Um and so even radical choices" + }, + { + "start": 2955.8, + "duration": 0.0, + "text": "model. Um and so even radical choices can<00:49:16.200> technically<00:49:16.760> work,<00:49:17.480> but<00:49:17.640> it's<00:49:17.840> probably" + }, + { + "start": 2958.47, + "duration": 0.0, + "text": "can technically work, but it's probably" + }, + { + "start": 2958.48, + "duration": 0.0, + "text": "can technically work, but it's probably going<00:49:18.640> to<00:49:18.720> be<00:49:18.840> compute<00:49:19.280> inefficient.<00:49:20.160> And<00:49:20.280> I" + }, + { + "start": 2960.31, + "duration": 0.0, + "text": "going to be compute inefficient. And I" + }, + { + "start": 2960.32, + "duration": 0.0, + "text": "going to be compute inefficient. And I think<00:49:20.520> the<00:49:20.640> funniest<00:49:21.120> part<00:49:21.320> of<00:49:21.359> the<00:49:21.480> saga<00:49:21.880> of" + }, + { + "start": 2961.95, + "duration": 0.0, + "text": "think the funniest part of the saga of" + }, + { + "start": 2961.96, + "duration": 0.0, + "text": "think the funniest part of the saga of this<00:49:22.160> kind<00:49:22.320> of<00:49:22.400> the<00:49:22.480> punchline<00:49:23.000> of<00:49:23.080> the<00:49:23.200> T5" + }, + { + "start": 2963.55, + "duration": 0.0, + "text": "this kind of the punchline of the T5" + }, + { + "start": 2963.56, + "duration": 0.0, + "text": "this kind of the punchline of the T5 saga<00:49:23.840> to<00:49:23.960> me<00:49:24.560> is<00:49:24.680> that<00:49:24.800> they<00:49:24.880> have<00:49:25.000> a<00:49:25.040> follow-up" + }, + { + "start": 2965.55, + "duration": 0.0, + "text": "saga to me is that they have a follow-up" + }, + { + "start": 2965.56, + "duration": 0.0, + "text": "saga to me is that they have a follow-up model<00:49:25.800> T5<00:49:26.240> 1.v<00:49:27.040> v1.1<00:49:28.480> um<00:49:28.800> that's<00:49:29.040> like" + }, + { + "start": 2969.19, + "duration": 0.0, + "text": "model T5 1.v v1.1 um that's like" + }, + { + 
"start": 2969.2, + "duration": 0.0, + "text": "model T5 1.v v1.1 um that's like supposed<00:49:29.480> to<00:49:29.560> be<00:49:29.600> the<00:49:29.680> improved<00:49:30.080> version<00:49:30.320> of" + }, + { + "start": 2970.39, + "duration": 0.0, + "text": "supposed to be the improved version of" + }, + { + "start": 2970.4, + "duration": 0.0, + "text": "supposed to be the improved version of T5<00:49:31.359> and<00:49:31.520> they<00:49:31.600> kind<00:49:31.800> of<00:49:32.120> go<00:49:32.280> back<00:49:32.600> to<00:49:32.680> the" + }, + { + "start": 2972.75, + "duration": 0.0, + "text": "T5 and they kind of go back to the" + }, + { + "start": 2972.76, + "duration": 0.0, + "text": "T5 and they kind of go back to the standard<00:49:33.120> 2.5<00:49:33.720> multiplier,<00:49:34.400> you<00:49:34.520> know?<00:49:34.640> So" + }, + { + "start": 2974.87, + "duration": 0.0, + "text": "standard 2.5 multiplier, you know? So" + }, + { + "start": 2974.88, + "duration": 0.0, + "text": "standard 2.5 multiplier, you know? 
So there's<00:49:35.080> nothing<00:49:35.359> explicitly<00:49:35.880> stated<00:49:36.280> here," + }, + { + "start": 2976.75, + "duration": 0.0, + "text": "there's nothing explicitly stated here," + }, + { + "start": 2976.76, + "duration": 0.0, + "text": "there's nothing explicitly stated here, but<00:49:36.880> clearly,<00:49:37.600> you<00:49:37.760> know,<00:49:37.880> when<00:49:38.000> they<00:49:38.080> tried" + }, + { + "start": 2978.31, + "duration": 0.0, + "text": "but clearly, you know, when they tried" + }, + { + "start": 2978.32, + "duration": 0.0, + "text": "but clearly, you know, when they tried to<00:49:38.560> to<00:49:38.680> update<00:49:39.120> T5,<00:49:39.640> they<00:49:39.720> decided<00:49:40.240> that<00:49:40.360> they" + }, + { + "start": 2980.43, + "duration": 0.0, + "text": "to to update T5, they decided that they" + }, + { + "start": 2980.44, + "duration": 0.0, + "text": "to to update T5, they decided that they wanted<00:49:40.680> to<00:49:40.760> go<00:49:40.880> back<00:49:41.120> to<00:49:41.240> a<00:49:41.280> more<00:49:41.480> standard" + }, + { + "start": 2981.95, + "duration": 0.0, + "text": "wanted to go back to a more standard" + }, + { + "start": 2981.96, + "duration": 0.0, + "text": "wanted to go back to a more standard multiplier,<00:49:42.480> which<00:49:42.640> I<00:49:42.680> find<00:49:42.920> to<00:49:43.000> be<00:49:43.120> a<00:49:43.160> little" + }, + { + "start": 2983.35, + "duration": 0.0, + "text": "multiplier, which I find to be a little" + }, + { + "start": 2983.36, + "duration": 0.0, + "text": "multiplier, which I find to be a little bit<00:49:43.520> funny." + }, + { + "start": 2985.07, + "duration": 0.0, + "text": "bit funny." + }, + { + "start": 2985.08, + "duration": 0.0, + "text": "bit funny. Okay.<00:49:45.880> So,<00:49:46.160> that's<00:49:46.560> the,<00:49:47.080> you<00:49:47.160> know," + }, + { + "start": 2987.31, + "duration": 0.0, + "text": "Okay. 
So, that's the, you know," + }, + { + "start": 2987.32, + "duration": 0.0, + "text": "Okay. So, that's the, you know, feed-forward<00:49:47.960> ratio," + }, + { + "start": 2989.07, + "duration": 0.0, + "text": "feed-forward ratio," + }, + { + "start": 2989.08, + "duration": 0.0, + "text": "feed-forward ratio, um<00:49:49.400> which<00:49:49.600> now<00:49:49.840> you<00:49:49.960> have<00:49:50.120> like<00:49:50.280> a<00:49:50.360> rough<00:49:50.680> sense" + }, + { + "start": 2990.99, + "duration": 0.0, + "text": "um which now you have like a rough sense" + }, + { + "start": 2991.0, + "duration": 0.0, + "text": "um which now you have like a rough sense of<00:49:51.080> like<00:49:51.200> what<00:49:51.320> the<00:49:51.440> right<00:49:51.680> order<00:49:51.880> of" + }, + { + "start": 2991.99, + "duration": 0.0, + "text": "of like what the right order of" + }, + { + "start": 2992.0, + "duration": 0.0, + "text": "of like what the right order of magnitude<00:49:52.480> is." + }, + { + "start": 2993.51, + "duration": 0.0, + "text": "magnitude is." + }, + { + "start": 2993.52, + "duration": 0.0, + "text": "magnitude is. Now,<00:49:53.840> let's<00:49:54.040> talk<00:49:54.240> about<00:49:54.480> a<00:49:54.520> different" + }, + { + "start": 2995.03, + "duration": 0.0, + "text": "Now, let's talk about a different" + }, + { + "start": 2995.04, + "duration": 0.0, + "text": "Now, let's talk about a different consensus<00:49:55.560> hyperparameter.<00:49:56.640> Um<00:49:57.160> I<00:49:57.240> always" + }, + { + "start": 2997.51, + "duration": 0.0, + "text": "consensus hyperparameter. Um I always" + }, + { + "start": 2997.52, + "duration": 0.0, + "text": "consensus hyperparameter. 
Um I always found<00:49:57.760> this<00:49:57.880> to<00:49:57.960> be<00:49:58.040> very<00:49:58.320> strange<00:49:59.160> when<00:49:59.440> sort" + }, + { + "start": 2999.59, + "duration": 0.0, + "text": "found this to be very strange when sort" + }, + { + "start": 2999.6, + "duration": 0.0, + "text": "found this to be very strange when sort of<00:49:59.720> teaching<00:50:00.880> uh<00:50:01.040> 224N<00:50:01.840> and,<00:50:02.040> you<00:50:02.120> know,<00:50:02.760> just" + }, + { + "start": 3002.91, + "duration": 0.0, + "text": "of teaching uh 224N and, you know, just" + }, + { + "start": 3002.92, + "duration": 0.0, + "text": "of teaching uh 224N and, you know, just sort<00:50:03.080> of<00:50:03.240> teaching<00:50:03.520> students<00:50:03.880> about<00:50:04.080> this," + }, + { + "start": 3004.87, + "duration": 0.0, + "text": "sort of teaching students about this," + }, + { + "start": 3004.88, + "duration": 0.0, + "text": "sort of teaching students about this, which<00:50:05.080> is,<00:50:05.280> if<00:50:05.440> you<00:50:05.560> have<00:50:05.680> a<00:50:05.720> multi-head" + }, + { + "start": 3006.31, + "duration": 0.0, + "text": "which is, if you have a multi-head" + }, + { + "start": 3006.32, + "duration": 0.0, + "text": "which is, if you have a multi-head attention,<00:50:06.880> where<00:50:06.960> you<00:50:07.040> have<00:50:07.200> multiple<00:50:07.720> heads" + }, + { + "start": 3008.11, + "duration": 0.0, + "text": "attention, where you have multiple heads" + }, + { + "start": 3008.12, + "duration": 0.0, + "text": "attention, where you have multiple heads for<00:50:08.280> your<00:50:08.400> attention<00:50:09.200> in<00:50:09.320> your<00:50:09.400> transformer," + }, + { + "start": 3010.75, + "duration": 0.0, + "text": "for your attention in your transformer," + }, + { + "start": 3010.76, + "duration": 0.0, + "text": "for your attention in your transformer, um<00:50:11.040> the<00:50:11.160> canonical<00:50:11.760> 
thing<00:50:11.960> to<00:50:12.080> do,<00:50:12.280> the<00:50:12.360> thing" + }, + { + "start": 3012.51, + "duration": 0.0, + "text": "um the canonical thing to do, the thing" + }, + { + "start": 3012.52, + "duration": 0.0, + "text": "um the canonical thing to do, the thing that<00:50:12.760> almost<00:50:13.120> everyone<00:50:13.600> does,<00:50:14.360> is<00:50:14.680> if<00:50:14.840> you" + }, + { + "start": 3014.91, + "duration": 0.0, + "text": "that almost everyone does, is if you" + }, + { + "start": 3014.92, + "duration": 0.0, + "text": "that almost everyone does, is if you have<00:50:15.120> multiple<00:50:15.560> heads,<00:50:16.240> you<00:50:16.360> make<00:50:16.640> sure<00:50:17.400> that" + }, + { + "start": 3017.75, + "duration": 0.0, + "text": "have multiple heads, you make sure that" + }, + { + "start": 3017.76, + "duration": 0.0, + "text": "have multiple heads, you make sure that the<00:50:17.880> size<00:50:18.320> of<00:50:18.440> those<00:50:18.680> heads,<00:50:18.960> the<00:50:19.040> head" + }, + { + "start": 3019.31, + "duration": 0.0, + "text": "the size of those heads, the head" + }, + { + "start": 3019.32, + "duration": 0.0, + "text": "the size of those heads, the head dimension,<00:50:20.280> is<00:50:20.440> such<00:50:20.680> that<00:50:21.240> you<00:50:21.520> sort<00:50:21.680> of<00:50:21.760> have" + }, + { + "start": 3021.87, + "duration": 0.0, + "text": "dimension, is such that you sort of have" + }, + { + "start": 3021.88, + "duration": 0.0, + "text": "dimension, is such that you sort of have the<00:50:22.000> same<00:50:22.200> dimension<00:50:22.600> as<00:50:22.720> a<00:50:22.800> single-head" + }, + { + "start": 3023.67, + "duration": 0.0, + "text": "the same dimension as a single-head" + }, + { + "start": 3023.68, + "duration": 0.0, + "text": "the same dimension as a single-head transformer,<00:50:24.240> right?<00:50:24.400> So,<00:50:24.480> you<00:50:24.640> always<00:50:25.000> make" + }, + { + "start": 
3025.19, + "duration": 0.0, + "text": "transformer, right? So, you always make" + }, + { + "start": 3025.2, + "duration": 0.0, + "text": "transformer, right? So, you always make sure<00:50:25.960> that<00:50:26.600> you<00:50:26.880> sort<00:50:27.040> of<00:50:27.120> divide<00:50:28.160> the<00:50:28.280> hidden" + }, + { + "start": 3028.59, + "duration": 0.0, + "text": "sure that you sort of divide the hidden" + }, + { + "start": 3028.6, + "duration": 0.0, + "text": "sure that you sort of divide the hidden dimension<00:50:29.440> to<00:50:29.640> basically<00:50:30.120> multiply<00:50:30.800> with<00:50:31.000> H." + }, + { + "start": 3031.19, + "duration": 0.0, + "text": "dimension to basically multiply with H." + }, + { + "start": 3031.2, + "duration": 0.0, + "text": "dimension to basically multiply with H. So,<00:50:31.320> in<00:50:31.440> this<00:50:31.600> case,<00:50:31.840> right,<00:50:32.000> you<00:50:32.080> have<00:50:32.320> H,<00:50:32.600> the" + }, + { + "start": 3032.67, + "duration": 0.0, + "text": "So, in this case, right, you have H, the" + }, + { + "start": 3032.68, + "duration": 0.0, + "text": "So, in this case, right, you have H, the number<00:50:32.920> of<00:50:33.040> heads,<00:50:33.680> and<00:50:33.760> the<00:50:33.880> dimension<00:50:34.280> of" + }, + { + "start": 3034.39, + "duration": 0.0, + "text": "number of heads, and the dimension of" + }, + { + "start": 3034.4, + "duration": 0.0, + "text": "number of heads, and the dimension of each<00:50:34.560> head<00:50:34.720> is<00:50:34.960> D<00:50:35.120> over<00:50:35.400> H,<00:50:35.600> so<00:50:35.680> you<00:50:35.760> multiply" + }, + { + "start": 3036.19, + "duration": 0.0, + "text": "each head is D over H, so you multiply" + }, + { + "start": 3036.2, + "duration": 0.0, + "text": "each head is D over H, so you multiply the<00:50:36.320> two<00:50:36.440> and<00:50:36.560> you<00:50:36.640> get<00:50:36.840> D,<00:50:37.360> right?<00:50:37.480> For<00:50:37.600> some" + 
}, + { + "start": 3037.79, + "duration": 0.0, + "text": "the two and you get D, right? For some" + }, + { + "start": 3037.8, + "duration": 0.0, + "text": "the two and you get D, right? For some reason,<00:50:38.080> this<00:50:38.240> is<00:50:38.320> kind<00:50:38.480> of<00:50:38.560> the<00:50:38.640> rule<00:50:38.880> of" + }, + { + "start": 3038.91, + "duration": 0.0, + "text": "reason, this is kind of the rule of" + }, + { + "start": 3038.92, + "duration": 0.0, + "text": "reason, this is kind of the rule of thumb." + }, + { + "start": 3039.87, + "duration": 0.0, + "text": "thumb." + }, + { + "start": 3039.88, + "duration": 0.0, + "text": "thumb. Um<00:50:40.320> of<00:50:40.440> course,<00:50:40.600> this<00:50:40.720> doesn't<00:50:41.000> have<00:50:41.240> to<00:50:41.360> be" + }, + { + "start": 3041.43, + "duration": 0.0, + "text": "Um of course, this doesn't have to be" + }, + { + "start": 3041.44, + "duration": 0.0, + "text": "Um of course, this doesn't have to be true.<00:50:41.640> We<00:50:41.720> can<00:50:41.880> arbitrarily<00:50:42.600> change<00:50:42.960> the<00:50:43.200> the" + }, + { + "start": 3043.35, + "duration": 0.0, + "text": "true. We can arbitrarily change the the" + }, + { + "start": 3043.36, + "duration": 0.0, + "text": "true. 
We can arbitrarily change the the ratios<00:50:43.840> between<00:50:44.080> head<00:50:44.280> dimensions<00:50:44.760> and<00:50:44.840> model" + }, + { + "start": 3045.11, + "duration": 0.0, + "text": "ratios between head dimensions and model" + }, + { + "start": 3045.12, + "duration": 0.0, + "text": "ratios between head dimensions and model dimensions,<00:50:45.960> but<00:50:46.120> most<00:50:46.480> models<00:50:46.880> do<00:50:47.520> follow" + }, + { + "start": 3047.79, + "duration": 0.0, + "text": "dimensions, but most models do follow" + }, + { + "start": 3047.8, + "duration": 0.0, + "text": "dimensions, but most models do follow this<00:50:48.000> guideline,<00:50:48.480> and<00:50:48.560> it<00:50:48.600> turns<00:50:48.920> out<00:50:49.480> to<00:50:49.560> work" + }, + { + "start": 3049.75, + "duration": 0.0, + "text": "this guideline, and it turns out to work" + }, + { + "start": 3049.76, + "duration": 0.0, + "text": "this guideline, and it turns out to work pretty<00:50:49.960> well.<00:50:50.680> Um" + }, + { + "start": 3051.59, + "duration": 0.0, + "text": "pretty well. Um" + }, + { + "start": 3051.6, + "duration": 0.0, + "text": "pretty well. Um You<00:50:51.640> know,<00:50:51.680> we<00:50:51.760> can<00:50:51.880> look<00:50:52.040> at<00:50:52.160> a<00:50:52.200> variety<00:50:52.720> of" + }, + { + "start": 3052.79, + "duration": 0.0, + "text": "You know, we can look at a variety of" + }, + { + "start": 3052.8, + "duration": 0.0, + "text": "You know, we can look at a variety of different<00:50:53.040> models,<00:50:53.360> classic<00:50:53.840> and<00:50:53.960> new.<00:50:54.280> I," + }, + { + "start": 3054.47, + "duration": 0.0, + "text": "different models, classic and new. I," + }, + { + "start": 3054.48, + "duration": 0.0, + "text": "different models, classic and new. 
I, you<00:50:54.600> know,<00:50:54.720> have<00:50:55.320> the<00:50:55.440> latest<00:50:55.760> and<00:50:55.840> greatest" + }, + { + "start": 3056.19, + "duration": 0.0, + "text": "you know, have the latest and greatest" + }, + { + "start": 3056.2, + "duration": 0.0, + "text": "you know, have the latest and greatest quad<00:50:56.400> as<00:50:56.560> well,<00:50:57.160> and<00:50:57.280> you<00:50:57.360> kind<00:50:57.520> of<00:50:57.600> find," + }, + { + "start": 3057.91, + "duration": 0.0, + "text": "quad as well, and you kind of find," + }, + { + "start": 3057.92, + "duration": 0.0, + "text": "quad as well, and you kind of find, yeah,<00:50:58.120> the<00:50:58.240> ratios<00:50:58.760> are<00:50:58.880> roughly<00:50:59.360> around<00:50:59.760> one" + }, + { + "start": 3060.31, + "duration": 0.0, + "text": "yeah, the ratios are roughly around one" + }, + { + "start": 3060.32, + "duration": 0.0, + "text": "yeah, the ratios are roughly around one a<00:51:00.440> model<00:51:00.920> head.<00:51:01.680> Um<00:51:01.920> you<00:51:02.040> know,<00:51:02.240> notable" + }, + { + "start": 3062.67, + "duration": 0.0, + "text": "a model head. Um you know, notable" + }, + { + "start": 3062.68, + "duration": 0.0, + "text": "a model head. 
Um you know, notable exception<00:51:03.200> of<00:51:03.320> T5,<00:51:04.480> um<00:51:04.960> and<00:51:05.120> even<00:51:05.400> Lambda," + }, + { + "start": 3065.75, + "duration": 0.0, + "text": "exception of T5, um and even Lambda," + }, + { + "start": 3065.76, + "duration": 0.0, + "text": "exception of T5, um and even Lambda, which<00:51:05.920> is<00:51:06.040> another<00:51:06.320> Google<00:51:06.600> model,<00:51:07.440> um<00:51:07.840> but" + }, + { + "start": 3068.07, + "duration": 0.0, + "text": "which is another Google model, um but" + }, + { + "start": 3068.08, + "duration": 0.0, + "text": "which is another Google model, um but really<00:51:08.400> everyone<00:51:08.800> sticks<00:51:09.160> around<00:51:09.520> one.<00:51:09.760> And<00:51:09.840> I" + }, + { + "start": 3069.87, + "duration": 0.0, + "text": "really everyone sticks around one. And I" + }, + { + "start": 3069.88, + "duration": 0.0, + "text": "really everyone sticks around one. And I think<00:51:10.040> this<00:51:10.200> is" + }, + { + "start": 3070.99, + "duration": 0.0, + "text": "think this is" + }, + { + "start": 3071.0, + "duration": 0.0, + "text": "think this is kind<00:51:11.320> of<00:51:11.440> an<00:51:11.520> interesting<00:51:12.000> one." + }, + { + "start": 3073.11, + "duration": 0.0, + "text": "kind of an interesting one." + }, + { + "start": 3073.12, + "duration": 0.0, + "text": "kind of an interesting one. 
Um<00:51:13.760> I<00:51:13.840> think<00:51:14.200> the<00:51:14.320> thing<00:51:14.480> about<00:51:14.720> head" + }, + { + "start": 3074.91, + "duration": 0.0, + "text": "Um I think the thing about head" + }, + { + "start": 3074.92, + "duration": 0.0, + "text": "Um I think the thing about head dimensions<00:51:15.920> uh<00:51:16.080> that<00:51:16.240> I'll<00:51:16.480> that<00:51:16.640> I'll<00:51:16.760> end" + }, + { + "start": 3076.95, + "duration": 0.0, + "text": "dimensions uh that I'll that I'll end" + }, + { + "start": 3076.96, + "duration": 0.0, + "text": "dimensions uh that I'll that I'll end with<00:51:17.120> here<00:51:17.600> is<00:51:17.720> I<00:51:17.760> think<00:51:17.880> this<00:51:18.040> is<00:51:18.160> yet<00:51:18.400> another" + }, + { + "start": 3078.75, + "duration": 0.0, + "text": "with here is I think this is yet another" + }, + { + "start": 3078.76, + "duration": 0.0, + "text": "with here is I think this is yet another kind<00:51:18.920> of<00:51:19.040> forgiving<00:51:19.720> hyperparameter.<00:51:20.840> Um" + }, + { + "start": 3080.91, + "duration": 0.0, + "text": "kind of forgiving hyperparameter. Um" + }, + { + "start": 3080.92, + "duration": 0.0, + "text": "kind of forgiving hyperparameter. Um there's<00:51:21.080> a<00:51:21.160> couple<00:51:21.440> of<00:51:21.520> ablations<00:51:21.920> that" + }, + { + "start": 3082.03, + "duration": 0.0, + "text": "there's a couple of ablations that" + }, + { + "start": 3082.04, + "duration": 0.0, + "text": "there's a couple of ablations that people<00:51:22.280> have<00:51:22.400> done.<00:51:22.640> There's<00:51:23.200> once<00:51:23.440> again<00:51:23.600> a" + }, + { + "start": 3083.63, + "duration": 0.0, + "text": "people have done. There's once again a" + }, + { + "start": 3083.64, + "duration": 0.0, + "text": "people have done. 
There's once again a pretty<00:51:23.840> wide<00:51:24.240> basin<00:51:24.600> around<00:51:24.920> one<00:51:25.560> that<00:51:25.720> you" + }, + { + "start": 3085.79, + "duration": 0.0, + "text": "pretty wide basin around one that you" + }, + { + "start": 3085.8, + "duration": 0.0, + "text": "pretty wide basin around one that you can<00:51:25.920> sort<00:51:26.080> of<00:51:26.160> get<00:51:26.280> away<00:51:26.520> with." + }, + { + "start": 3087.75, + "duration": 0.0, + "text": "can sort of get away with." + }, + { + "start": 3087.76, + "duration": 0.0, + "text": "can sort of get away with. Okay,<00:51:27.960> but<00:51:28.120> that<00:51:28.200> one's<00:51:28.360> maybe<00:51:28.960> not<00:51:29.320> the<00:51:29.400> most" + }, + { + "start": 3089.63, + "duration": 0.0, + "text": "Okay, but that one's maybe not the most" + }, + { + "start": 3089.64, + "duration": 0.0, + "text": "Okay, but that one's maybe not the most critical<00:51:30.480> uh<00:51:30.600> hyperparameter." + }, + { + "start": 3091.95, + "duration": 0.0, + "text": "critical uh hyperparameter." + }, + { + "start": 3091.96, + "duration": 0.0, + "text": "critical uh hyperparameter. 
I<00:51:32.040> think<00:51:32.200> maybe<00:51:32.480> one<00:51:32.640> of<00:51:32.680> the<00:51:32.800> most<00:51:33.200> critical" + }, + { + "start": 3093.51, + "duration": 0.0, + "text": "I think maybe one of the most critical" + }, + { + "start": 3093.52, + "duration": 0.0, + "text": "I think maybe one of the most critical and<00:51:33.640> interesting<00:51:34.160> ones,<00:51:34.600> I<00:51:34.640> think" + }, + { + "start": 3094.83, + "duration": 0.0, + "text": "and interesting ones, I think" + }, + { + "start": 3094.84, + "duration": 0.0, + "text": "and interesting ones, I think conceptually,<00:51:35.640> is<00:51:35.760> this<00:51:36.000> idea<00:51:36.280> of<00:51:36.400> an<00:51:36.520> aspect" + }, + { + "start": 3096.99, + "duration": 0.0, + "text": "conceptually, is this idea of an aspect" + }, + { + "start": 3097.0, + "duration": 0.0, + "text": "conceptually, is this idea of an aspect ratio,<00:51:37.680> right?<00:51:38.480> Um<00:51:38.880> and<00:51:39.040> then<00:51:39.200> sort<00:51:39.360> of<00:51:39.440> to<00:51:39.520> add" + }, + { + "start": 3099.67, + "duration": 0.0, + "text": "ratio, right? Um and then sort of to add" + }, + { + "start": 3099.68, + "duration": 0.0, + "text": "ratio, right? 
Um and then sort of to add an<00:51:39.800> extra<00:51:40.200> point<00:51:40.400> here," + }, + { + "start": 3101.27, + "duration": 0.0, + "text": "an extra point here," + }, + { + "start": 3101.28, + "duration": 0.0, + "text": "an extra point here, um<00:51:41.680> when<00:51:41.840> you<00:51:41.960> scale<00:51:42.320> models<00:51:42.680> up<00:51:42.880> or<00:51:42.960> down,<00:51:43.560> the" + }, + { + "start": 3103.67, + "duration": 0.0, + "text": "um when you scale models up or down, the" + }, + { + "start": 3103.68, + "duration": 0.0, + "text": "um when you scale models up or down, the way<00:51:43.840> you<00:51:43.960> usually<00:51:44.320> do<00:51:44.480> that<00:51:44.640> is<00:51:44.760> you<00:51:44.840> fix<00:51:45.080> an" + }, + { + "start": 3105.19, + "duration": 0.0, + "text": "way you usually do that is you fix an" + }, + { + "start": 3105.2, + "duration": 0.0, + "text": "way you usually do that is you fix an aspect<00:51:45.560> ratio,<00:51:45.840> like<00:51:46.000> how<00:51:46.200> wide<00:51:46.520> your<00:51:46.640> model" + }, + { + "start": 3106.99, + "duration": 0.0, + "text": "aspect ratio, like how wide your model" + }, + { + "start": 3107.0, + "duration": 0.0, + "text": "aspect ratio, like how wide your model is<00:51:47.160> versus<00:51:47.440> how<00:51:47.560> deep<00:51:47.880> it<00:51:48.000> is,<00:51:48.480> and<00:51:48.560> then<00:51:48.640> you" + }, + { + "start": 3108.71, + "duration": 0.0, + "text": "is versus how deep it is, and then you" + }, + { + "start": 3108.72, + "duration": 0.0, + "text": "is versus how deep it is, and then you make<00:51:48.880> the<00:51:48.960> whole<00:51:49.160> model<00:51:49.480> bigger,<00:51:49.800> right?<00:51:49.960> So," + }, + { + "start": 3110.03, + "duration": 0.0, + "text": "make the whole model bigger, right? So," + }, + { + "start": 3110.04, + "duration": 0.0, + "text": "make the whole model bigger, right? 
So, the<00:51:50.160> aspect<00:51:50.560> ratio<00:51:50.840> in<00:51:50.920> some<00:51:51.080> sense<00:51:51.240> controls" + }, + { + "start": 3112.31, + "duration": 0.0, + "text": "the aspect ratio in some sense controls" + }, + { + "start": 3112.32, + "duration": 0.0, + "text": "the aspect ratio in some sense controls the<00:51:52.440> entire<00:51:53.040> depth-to-width<00:51:53.800> tradeoff<00:51:54.560> as" + }, + { + "start": 3114.71, + "duration": 0.0, + "text": "the entire depth-to-width tradeoff as" + }, + { + "start": 3114.72, + "duration": 0.0, + "text": "the entire depth-to-width tradeoff as you<00:51:54.800> make<00:51:55.040> models<00:51:55.400> bigger,<00:51:55.960> right?" + }, + { + "start": 3117.11, + "duration": 0.0, + "text": "you make models bigger, right?" + }, + { + "start": 3117.12, + "duration": 0.0, + "text": "you make models bigger, right? Now,<00:51:57.680> you<00:51:57.760> might<00:51:58.000> wonder<00:51:58.280> how<00:51:58.480> deep<00:51:58.840> should<00:51:59.000> my" + }, + { + "start": 3119.15, + "duration": 0.0, + "text": "Now, you might wonder how deep should my" + }, + { + "start": 3119.16, + "duration": 0.0, + "text": "Now, you might wonder how deep should my model<00:51:59.520> be.<00:51:59.880> Like,<00:52:00.000> if<00:52:00.120> you've<00:52:00.240> been<00:52:00.400> following" + }, + { + "start": 3120.83, + "duration": 0.0, + "text": "model be. Like, if you've been following" + }, + { + "start": 3120.84, + "duration": 0.0, + "text": "model be. 
Like, if you've been following all<00:52:00.960> this<00:52:01.120> stuff<00:52:01.320> on<00:52:01.440> like<00:52:01.640> reasoning<00:52:02.320> and<00:52:02.480> so" + }, + { + "start": 3122.67, + "duration": 0.0, + "text": "all this stuff on like reasoning and so" + }, + { + "start": 3122.68, + "duration": 0.0, + "text": "all this stuff on like reasoning and so on,<00:52:02.840> you<00:52:02.920> might<00:52:03.120> think<00:52:03.800> I<00:52:03.920> need<00:52:04.080> a<00:52:04.160> really<00:52:04.400> deep" + }, + { + "start": 3124.63, + "duration": 0.0, + "text": "on, you might think I need a really deep" + }, + { + "start": 3124.64, + "duration": 0.0, + "text": "on, you might think I need a really deep model<00:52:05.120> or<00:52:05.200> really<00:52:05.440> shallow<00:52:05.840> model<00:52:06.200> if<00:52:06.320> I<00:52:06.360> want" + }, + { + "start": 3126.55, + "duration": 0.0, + "text": "model or really shallow model if I want" + }, + { + "start": 3126.56, + "duration": 0.0, + "text": "model or really shallow model if I want systems<00:52:06.960> utilization.<00:52:08.160> You<00:52:08.280> might<00:52:08.480> think" + }, + { + "start": 3128.63, + "duration": 0.0, + "text": "systems utilization. You might think" + }, + { + "start": 3128.64, + "duration": 0.0, + "text": "systems utilization. You might think that<00:52:08.760> there's<00:52:08.960> a<00:52:09.000> lot<00:52:09.200> of<00:52:09.280> sort<00:52:09.400> of<00:52:09.480> variation." + }, + { + "start": 3129.99, + "duration": 0.0, + "text": "that there's a lot of sort of variation." + }, + { + "start": 3130.0, + "duration": 0.0, + "text": "that there's a lot of sort of variation. 
And<00:52:10.120> there<00:52:10.280> is<00:52:10.400> a<00:52:10.440> lot<00:52:10.640> of<00:52:10.720> variation,<00:52:11.480> um<00:52:11.760> much" + }, + { + "start": 3132.07, + "duration": 0.0, + "text": "And there is a lot of variation, um much" + }, + { + "start": 3132.08, + "duration": 0.0, + "text": "And there is a lot of variation, um much more<00:52:12.240> so<00:52:12.360> than<00:52:12.520> other<00:52:12.720> hyperparameters,<00:52:14.000> but" + }, + { + "start": 3134.11, + "duration": 0.0, + "text": "more so than other hyperparameters, but" + }, + { + "start": 3134.12, + "duration": 0.0, + "text": "more so than other hyperparameters, but there's<00:52:14.320> actually<00:52:14.800> like<00:52:15.040> a<00:52:15.400> fairly<00:52:15.800> clear" + }, + { + "start": 3136.07, + "duration": 0.0, + "text": "there's actually like a fairly clear" + }, + { + "start": 3136.08, + "duration": 0.0, + "text": "there's actually like a fairly clear sweet<00:52:16.480> spot<00:52:16.800> that<00:52:17.400> most<00:52:17.880> modern<00:52:18.240> models<00:52:18.600> fall" + }, + { + "start": 3138.87, + "duration": 0.0, + "text": "sweet spot that most modern models fall" + }, + { + "start": 3138.88, + "duration": 0.0, + "text": "sweet spot that most modern models fall into." + }, + { + "start": 3139.87, + "duration": 0.0, + "text": "into." + }, + { + "start": 3139.88, + "duration": 0.0, + "text": "into. 
Um<00:52:20.360> you<00:52:20.440> don't<00:52:20.640> really<00:52:20.840> see<00:52:21.000> models<00:52:21.400> go<00:52:21.800> like" + }, + { + "start": 3142.67, + "duration": 0.0, + "text": "Um you don't really see models go like" + }, + { + "start": 3142.68, + "duration": 0.0, + "text": "Um you don't really see models go like too<00:52:23.560> uh<00:52:23.720> too<00:52:24.040> deep,<00:52:24.880> um<00:52:25.120> and<00:52:25.240> you<00:52:25.320> also<00:52:25.560> don't" + }, + { + "start": 3145.75, + "duration": 0.0, + "text": "too uh too deep, um and you also don't" + }, + { + "start": 3145.76, + "duration": 0.0, + "text": "too uh too deep, um and you also don't see<00:52:25.880> models<00:52:26.240> go<00:52:26.440> too<00:52:27.120> wide<00:52:27.960> uh<00:52:28.040> in<00:52:28.200> either" + }, + { + "start": 3148.43, + "duration": 0.0, + "text": "see models go too wide uh in either" + }, + { + "start": 3148.44, + "duration": 0.0, + "text": "see models go too wide uh in either direction,<00:52:28.840> right?<00:52:29.040> You<00:52:29.160> see<00:52:29.320> most<00:52:29.720> models" + }, + { + "start": 3150.31, + "duration": 0.0, + "text": "direction, right? You see most models" + }, + { + "start": 3150.32, + "duration": 0.0, + "text": "direction, right? You see most models have<00:52:30.440> a<00:52:30.520> ratio<00:52:30.840> about<00:52:31.000> a<00:52:31.040> hundred<00:52:32.120> um<00:52:32.320> D<00:52:32.560> model" + }, + { + "start": 3152.95, + "duration": 0.0, + "text": "have a ratio about a hundred um D model" + }, + { + "start": 3152.96, + "duration": 0.0, + "text": "have a ratio about a hundred um D model over<00:52:33.280> N<00:52:33.480> layers.<00:52:34.400> Um<00:52:34.600> so,<00:52:35.000> about<00:52:35.360> hundred<00:52:35.880> sort" + }, + { + "start": 3155.99, + "duration": 0.0, + "text": "over N layers. Um so, about hundred sort" + }, + { + "start": 3156.0, + "duration": 0.0, + "text": "over N layers. 
Um so, about hundred sort of<00:52:36.240> width<00:52:36.920> for<00:52:37.120> every<00:52:37.360> layer<00:52:38.000> that<00:52:38.160> you<00:52:38.280> have." + }, + { + "start": 3159.11, + "duration": 0.0, + "text": "of width for every layer that you have." + }, + { + "start": 3159.12, + "duration": 0.0, + "text": "of width for every layer that you have. I<00:52:39.160> mean,<00:52:39.320> this<00:52:39.440> is<00:52:39.560> true<00:52:39.760> for<00:52:39.840> like<00:52:40.000> GPT-3<00:52:40.600> or" + }, + { + "start": 3160.71, + "duration": 0.0, + "text": "I mean, this is true for like GPT-3 or" + }, + { + "start": 3160.72, + "duration": 0.0, + "text": "I mean, this is true for like GPT-3 or LLaMA<00:52:41.360> or<00:52:41.520> any<00:52:41.720> one<00:52:41.840> of<00:52:41.920> these<00:52:42.080> models." + }, + { + "start": 3163.55, + "duration": 0.0, + "text": "LLaMA or any one of these models." + }, + { + "start": 3163.56, + "duration": 0.0, + "text": "LLaMA or any one of these models. Um" + }, + { + "start": 3164.87, + "duration": 0.0, + "text": "Um" + }, + { + "start": 3164.88, + "duration": 0.0, + "text": "Um and<00:52:45.600> really,<00:52:46.040> I<00:52:46.080> think<00:52:46.280> the<00:52:46.360> considerations" + }, + { + "start": 3167.23, + "duration": 0.0, + "text": "and really, I think the considerations" + }, + { + "start": 3167.24, + "duration": 0.0, + "text": "and really, I think the considerations are<00:52:47.400> partly<00:52:47.840> a<00:52:47.920> tradeoff<00:52:48.440> between" + }, + { + "start": 3168.67, + "duration": 0.0, + "text": "are partly a tradeoff between" + }, + { + "start": 3168.68, + "duration": 0.0, + "text": "are partly a tradeoff between expressiveness<00:52:49.840> and<00:52:50.000> hardware,<00:52:50.440> right?<00:52:50.640> If" + }, + { + "start": 3170.75, + "duration": 0.0, + "text": "expressiveness and hardware, right? 
If" + }, + { + "start": 3170.76, + "duration": 0.0, + "text": "expressiveness and hardware, right? If you<00:52:50.800> have<00:52:50.920> an<00:52:50.960> extremely<00:52:51.880> extremely<00:52:52.360> deep" + }, + { + "start": 3172.59, + "duration": 0.0, + "text": "you have an extremely extremely deep" + }, + { + "start": 3172.6, + "duration": 0.0, + "text": "you have an extremely extremely deep model,<00:52:53.520> um<00:52:53.840> they<00:52:54.000> get<00:52:54.200> very<00:52:54.560> very<00:52:54.840> annoying<00:52:55.160> to" + }, + { + "start": 3175.27, + "duration": 0.0, + "text": "model, um they get very very annoying to" + }, + { + "start": 3175.28, + "duration": 0.0, + "text": "model, um they get very very annoying to deal<00:52:55.520> with<00:52:55.680> systems-wise.<00:52:56.360> The<00:52:56.480> deeper<00:52:56.880> your" + }, + { + "start": 3177.03, + "duration": 0.0, + "text": "deal with systems-wise. The deeper your" + }, + { + "start": 3177.04, + "duration": 0.0, + "text": "deal with systems-wise. The deeper your model,<00:52:57.520> like,<00:52:57.840> what<00:52:58.040> is<00:52:58.160> the<00:52:58.280> ways<00:52:58.560> that<00:52:58.680> you" + }, + { + "start": 3178.75, + "duration": 0.0, + "text": "model, like, what is the ways that you" + }, + { + "start": 3178.76, + "duration": 0.0, + "text": "model, like, what is the ways that you have<00:52:58.920> for<00:52:59.040> parallelizing<00:52:59.680> them?<00:53:00.320> Well,<00:53:00.600> you" + }, + { + "start": 3180.71, + "duration": 0.0, + "text": "have for parallelizing them? Well, you" + }, + { + "start": 3180.72, + "duration": 0.0, + "text": "have for parallelizing them? Well, you might<00:53:00.920> have<00:53:01.120> to<00:53:01.240> cut<00:53:01.520> up<00:53:01.640> your<00:53:01.800> layers.<00:53:02.320> If<00:53:02.440> you" + }, + { + "start": 3182.55, + "duration": 0.0, + "text": "might have to cut up your layers. 
If you" + }, + { + "start": 3182.56, + "duration": 0.0, + "text": "might have to cut up your layers. If you cut<00:53:02.760> up<00:53:02.880> your<00:53:03.000> layers," + }, + { + "start": 3184.19, + "duration": 0.0, + "text": "cut up your layers," + }, + { + "start": 3184.2, + "duration": 0.0, + "text": "cut up your layers, we'll<00:53:04.320> talk<00:53:04.520> about<00:53:04.760> this<00:53:04.880> in<00:53:05.000> the<00:53:05.080> systems" + }, + { + "start": 3185.43, + "duration": 0.0, + "text": "we'll talk about this in the systems" + }, + { + "start": 3185.44, + "duration": 0.0, + "text": "we'll talk about this in the systems lecture.<00:53:05.720> Once<00:53:05.920> you<00:53:06.000> start<00:53:06.320> cutting<00:53:06.560> up<00:53:06.680> your" + }, + { + "start": 3186.83, + "duration": 0.0, + "text": "lecture. Once you start cutting up your" + }, + { + "start": 3186.84, + "duration": 0.0, + "text": "lecture. Once you start cutting up your layers<00:53:07.680> um<00:53:08.120> depth-wise,<00:53:09.120> you<00:53:09.200> have<00:53:09.400> very" + }, + { + "start": 3189.75, + "duration": 0.0, + "text": "layers um depth-wise, you have very" + }, + { + "start": 3189.76, + "duration": 0.0, + "text": "layers um depth-wise, you have very serious<00:53:10.320> issues<00:53:10.640> in<00:53:10.760> parallelization." + }, + { + "start": 3191.51, + "duration": 0.0, + "text": "serious issues in parallelization." + }, + { + "start": 3191.52, + "duration": 0.0, + "text": "serious issues in parallelization. 
Pipeline<00:53:11.960> parallel,<00:53:12.360> which<00:53:12.520> is<00:53:12.600> what<00:53:12.720> this<00:53:12.880> is" + }, + { + "start": 3192.99, + "duration": 0.0, + "text": "Pipeline parallel, which is what this is" + }, + { + "start": 3193.0, + "duration": 0.0, + "text": "Pipeline parallel, which is what this is called,<00:53:13.720> is<00:53:13.840> something<00:53:14.120> that<00:53:14.320> like<00:53:14.480> most" + }, + { + "start": 3194.91, + "duration": 0.0, + "text": "called, is something that like most" + }, + { + "start": 3194.92, + "duration": 0.0, + "text": "called, is something that like most people<00:53:15.280> really<00:53:15.640> really<00:53:15.960> do<00:53:16.120> not<00:53:16.320> want<00:53:16.480> to<00:53:16.560> deal" + }, + { + "start": 3196.79, + "duration": 0.0, + "text": "people really really do not want to deal" + }, + { + "start": 3196.8, + "duration": 0.0, + "text": "people really really do not want to deal with.<00:53:17.480> Whereas<00:53:17.800> width<00:53:18.280> is<00:53:18.440> much<00:53:18.960> easier<00:53:19.240> to" + }, + { + "start": 3199.31, + "duration": 0.0, + "text": "with. Whereas width is much easier to" + }, + { + "start": 3199.32, + "duration": 0.0, + "text": "with. Whereas width is much easier to parallelize.<00:53:19.800> If<00:53:19.840> you<00:53:19.880> have<00:53:20.000> a<00:53:20.080> really<00:53:20.440> wide" + }, + { + "start": 3200.75, + "duration": 0.0, + "text": "parallelize. If you have a really wide" + }, + { + "start": 3200.76, + "duration": 0.0, + "text": "parallelize. 
If you have a really wide model,<00:53:21.440> you<00:53:21.480> know,<00:53:21.560> you<00:53:21.640> can<00:53:21.800> cut<00:53:22.000> that<00:53:22.200> up" + }, + { + "start": 3202.63, + "duration": 0.0, + "text": "model, you know, you can cut that up" + }, + { + "start": 3202.64, + "duration": 0.0, + "text": "model, you know, you can cut that up very<00:53:23.120> easily<00:53:23.560> in<00:53:23.640> your<00:53:23.760> GPUs.<00:53:24.640> Uh<00:53:24.720> tensor" + }, + { + "start": 3205.07, + "duration": 0.0, + "text": "very easily in your GPUs. Uh tensor" + }, + { + "start": 3205.08, + "duration": 0.0, + "text": "very easily in your GPUs. Uh tensor parallel<00:53:25.480> is<00:53:25.600> what<00:53:25.720> it's<00:53:25.840> called<00:53:26.080> is<00:53:26.160> much" + }, + { + "start": 3206.47, + "duration": 0.0, + "text": "parallel is what it's called is much" + }, + { + "start": 3206.48, + "duration": 0.0, + "text": "parallel is what it's called is much much<00:53:26.840> simpler<00:53:27.200> to<00:53:27.320> deal<00:53:27.520> with." + }, + { + "start": 3208.75, + "duration": 0.0, + "text": "much simpler to deal with." + }, + { + "start": 3208.76, + "duration": 0.0, + "text": "much simpler to deal with. 
Um" + }, + { + "start": 3209.55, + "duration": 0.0, + "text": "Um" + }, + { + "start": 3209.56, + "duration": 0.0, + "text": "Um and<00:53:29.680> so,<00:53:29.960> in<00:53:30.080> some<00:53:30.320> sense,<00:53:30.760> you<00:53:30.840> know,<00:53:30.920> there's" + }, + { + "start": 3211.11, + "duration": 0.0, + "text": "and so, in some sense, you know, there's" + }, + { + "start": 3211.12, + "duration": 0.0, + "text": "and so, in some sense, you know, there's systems<00:53:31.560> reasons<00:53:31.920> to<00:53:32.000> go<00:53:32.200> wide,<00:53:32.680> and<00:53:32.760> maybe" + }, + { + "start": 3212.95, + "duration": 0.0, + "text": "systems reasons to go wide, and maybe" + }, + { + "start": 3212.96, + "duration": 0.0, + "text": "systems reasons to go wide, and maybe there's<00:53:33.160> expressiveness<00:53:33.880> reasons<00:53:34.200> to<00:53:34.320> go" + }, + { + "start": 3214.43, + "duration": 0.0, + "text": "there's expressiveness reasons to go" + }, + { + "start": 3214.44, + "duration": 0.0, + "text": "there's expressiveness reasons to go deep,<00:53:35.240> and<00:53:35.360> you<00:53:35.480> end<00:53:35.720> up<00:53:36.160> at<00:53:36.480> roughly<00:53:36.920> a" + }, + { + "start": 3216.99, + "duration": 0.0, + "text": "deep, and you end up at roughly a" + }, + { + "start": 3217.0, + "duration": 0.0, + "text": "deep, and you end up at roughly a hundred.<00:53:38.040> Um<00:53:39.000> and<00:53:40.000> I<00:53:40.040> think<00:53:40.280> one<00:53:40.400> of<00:53:40.480> the" + }, + { + "start": 3220.55, + "duration": 0.0, + "text": "hundred. Um and I think one of the" + }, + { + "start": 3220.56, + "duration": 0.0, + "text": "hundred. 
Um and I think one of the really<00:53:40.800> interesting<00:53:41.240> things<00:53:41.480> about<00:53:42.280> um" + }, + { + "start": 3223.03, + "duration": 0.0, + "text": "really interesting things about um" + }, + { + "start": 3223.04, + "duration": 0.0, + "text": "really interesting things about um transformer<00:53:43.640> hyperparameters<00:53:44.720> is<00:53:44.840> there<00:53:44.960> are" + }, + { + "start": 3225.03, + "duration": 0.0, + "text": "transformer hyperparameters is there are" + }, + { + "start": 3225.04, + "duration": 0.0, + "text": "transformer hyperparameters is there are a<00:53:45.080> lot<00:53:45.320> of<00:53:45.400> hyperparameters<00:53:46.040> that<00:53:46.160> seem<00:53:46.400> quite" + }, + { + "start": 3226.59, + "duration": 0.0, + "text": "a lot of hyperparameters that seem quite" + }, + { + "start": 3226.6, + "duration": 0.0, + "text": "a lot of hyperparameters that seem quite important,<00:53:47.480> but<00:53:47.600> they're<00:53:47.720> also<00:53:48.000> fairly" + }, + { + "start": 3228.43, + "duration": 0.0, + "text": "important, but they're also fairly" + }, + { + "start": 3228.44, + "duration": 0.0, + "text": "important, but they're also fairly forgiving,<00:53:49.000> and<00:53:49.120> people<00:53:49.320> have<00:53:49.440> converged" + }, + { + "start": 3230.11, + "duration": 0.0, + "text": "forgiving, and people have converged" + }, + { + "start": 3230.12, + "duration": 0.0, + "text": "forgiving, and people have converged roughly<00:53:50.520> on<00:53:50.640> the<00:53:50.720> minimum.<00:53:51.720> This<00:53:51.800> is<00:53:51.920> yet" + }, + { + "start": 3232.15, + "duration": 0.0, + "text": "roughly on the minimum. This is yet" + }, + { + "start": 3232.16, + "duration": 0.0, + "text": "roughly on the minimum. 
This is yet another<00:53:52.480> plot<00:53:52.800> from<00:53:52.960> Kaplan<00:53:53.359> et<00:53:53.480> al.,<00:53:54.040> um" + }, + { + "start": 3234.27, + "duration": 0.0, + "text": "another plot from Kaplan et al., um" + }, + { + "start": 3234.28, + "duration": 0.0, + "text": "another plot from Kaplan et al., um which<00:53:54.520> shows<00:53:54.720> another<00:53:55.040> sweep<00:53:55.400> over" + }, + { + "start": 3235.59, + "duration": 0.0, + "text": "which shows another sweep over" + }, + { + "start": 3235.6, + "duration": 0.0, + "text": "which shows another sweep over hyperparameters<00:53:56.520> for<00:53:56.880> differently<00:53:57.320> sized" + }, + { + "start": 3237.63, + "duration": 0.0, + "text": "hyperparameters for differently sized" + }, + { + "start": 3237.64, + "duration": 0.0, + "text": "hyperparameters for differently sized models." + }, + { + "start": 3238.67, + "duration": 0.0, + "text": "models." + }, + { + "start": 3238.68, + "duration": 0.0, + "text": "models. 
Um<00:53:58.840> and<00:53:58.920> once<00:53:59.120> again,<00:53:59.400> you<00:53:59.520> see,<00:54:00.040> regardless" + }, + { + "start": 3240.55, + "duration": 0.0, + "text": "Um and once again, you see, regardless" + }, + { + "start": 3240.56, + "duration": 0.0, + "text": "Um and once again, you see, regardless of<00:54:00.680> kind<00:54:00.800> of<00:54:00.880> the<00:54:01.000> size<00:54:01.480> of<00:54:01.600> your<00:54:01.760> model," + }, + { + "start": 3242.51, + "duration": 0.0, + "text": "of kind of the size of your model," + }, + { + "start": 3242.52, + "duration": 0.0, + "text": "of kind of the size of your model, roughly<00:54:02.920> speaking,<00:54:03.400> the<00:54:03.520> optimum<00:54:04.040> aspect" + }, + { + "start": 3244.47, + "duration": 0.0, + "text": "roughly speaking, the optimum aspect" + }, + { + "start": 3244.48, + "duration": 0.0, + "text": "roughly speaking, the optimum aspect ratio<00:54:04.880> is<00:54:05.440> fairly<00:54:05.800> similar,<00:54:06.760> and<00:54:06.920> they<00:54:07.080> live" + }, + { + "start": 3247.95, + "duration": 0.0, + "text": "ratio is fairly similar, and they live" + }, + { + "start": 3247.96, + "duration": 0.0, + "text": "ratio is fairly similar, and they live at<00:54:08.120> about<00:54:08.359> a<00:54:08.400> hundred,<00:54:08.840> maybe<00:54:09.120> a<00:54:09.200> little<00:54:09.400> bit" + }, + { + "start": 3249.59, + "duration": 0.0, + "text": "at about a hundred, maybe a little bit" + }, + { + "start": 3249.6, + "duration": 0.0, + "text": "at about a hundred, maybe a little bit less<00:54:09.840> depending<00:54:10.200> on<00:54:10.280> how<00:54:10.440> you<00:54:10.520> want<00:54:10.640> to<00:54:10.720> do<00:54:10.800> the" + }, + { + "start": 3250.87, + "duration": 0.0, + "text": "less depending on how you want to do the" + }, + { + "start": 3250.88, + "duration": 0.0, + "text": "less depending on how you want to do the accounting,<00:54:11.560> but<00:54:11.720> 
really,<00:54:12.359> you<00:54:12.440> know," + }, + { + "start": 3252.55, + "duration": 0.0, + "text": "accounting, but really, you know," + }, + { + "start": 3252.56, + "duration": 0.0, + "text": "accounting, but really, you know, anywhere<00:54:12.920> near<00:54:13.120> a<00:54:13.160> hundred<00:54:13.560> is<00:54:13.680> a<00:54:13.760> pretty<00:54:14.200> safe" + }, + { + "start": 3254.51, + "duration": 0.0, + "text": "anywhere near a hundred is a pretty safe" + }, + { + "start": 3254.52, + "duration": 0.0, + "text": "anywhere near a hundred is a pretty safe bet<00:54:15.160> for<00:54:15.359> aspect<00:54:15.840> ratios." + }, + { + "start": 3257.39, + "duration": 0.0, + "text": "bet for aspect ratios." + }, + { + "start": 3257.4, + "duration": 0.0, + "text": "bet for aspect ratios. Um<00:54:18.080> ETA<00:54:18.480> and<00:54:18.680> others<00:54:19.480> uh<00:54:19.760> did<00:54:20.280> a<00:54:20.359> number<00:54:20.840> of" + }, + { + "start": 3261.07, + "duration": 0.0, + "text": "Um ETA and others uh did a number of" + }, + { + "start": 3261.08, + "duration": 0.0, + "text": "Um ETA and others uh did a number of really<00:54:21.320> interesting<00:54:21.880> sort<00:54:22.040> of<00:54:22.120> like" + }, + { + "start": 3262.31, + "duration": 0.0, + "text": "really interesting sort of like" + }, + { + "start": 3262.32, + "duration": 0.0, + "text": "really interesting sort of like architecture<00:54:23.600> um" + }, + { + "start": 3264.43, + "duration": 0.0, + "text": "architecture um" + }, + { + "start": 3264.44, + "duration": 0.0, + "text": "architecture um uh" + }, + { + "start": 3265.11, + "duration": 0.0, + "text": "uh" + }, + { + "start": 3265.12, + "duration": 0.0, + "text": "uh architecture<00:54:25.520> variation<00:54:26.000> experiments,<00:54:27.120> in" + }, + { + "start": 3267.19, + "duration": 0.0, + "text": "architecture variation experiments, in" + }, + { + "start": 3267.2, + "duration": 0.0, + "text": "architecture variation 
experiments, in which<00:54:28.120> their<00:54:28.280> general<00:54:28.680> conclusion<00:54:29.240> on<00:54:29.400> this" + }, + { + "start": 3269.71, + "duration": 0.0, + "text": "which their general conclusion on this" + }, + { + "start": 3269.72, + "duration": 0.0, + "text": "which their general conclusion on this was<00:54:29.960> that<00:54:30.720> let's<00:54:31.040> look<00:54:31.200> at<00:54:31.280> the<00:54:31.359> top<00:54:31.680> panel" + }, + { + "start": 3271.95, + "duration": 0.0, + "text": "was that let's look at the top panel" + }, + { + "start": 3271.96, + "duration": 0.0, + "text": "was that let's look at the top panel here.<00:54:32.240> You<00:54:32.359> have<00:54:32.480> a<00:54:32.520> lot<00:54:32.840> of<00:54:32.960> different<00:54:33.320> kinds" + }, + { + "start": 3273.79, + "duration": 0.0, + "text": "here. You have a lot of different kinds" + }, + { + "start": 3273.8, + "duration": 0.0, + "text": "here. You have a lot of different kinds of" + }, + { + "start": 3274.75, + "duration": 0.0, + "text": "of" + }, + { + "start": 3274.76, + "duration": 0.0, + "text": "of uh<00:54:34.880> models<00:54:35.240> that<00:54:35.359> you<00:54:35.440> can<00:54:35.560> have<00:54:35.760> in<00:54:35.840> terms<00:54:36.040> of" + }, + { + "start": 3276.19, + "duration": 0.0, + "text": "uh models that you can have in terms of" + }, + { + "start": 3276.2, + "duration": 0.0, + "text": "uh models that you can have in terms of depth-to-width<00:54:36.880> tradeoffs,<00:54:38.000> um<00:54:38.280> but<00:54:38.480> as<00:54:38.640> you" + }, + { + "start": 3278.71, + "duration": 0.0, + "text": "depth-to-width tradeoffs, um but as you" + }, + { + "start": 3278.72, + "duration": 0.0, + "text": "depth-to-width tradeoffs, um but as you sort<00:54:38.880> of<00:54:38.960> sweep<00:54:39.320> the<00:54:39.440> depth-to-width" + }, + { + "start": 3279.99, + "duration": 0.0, + "text": "sort of sweep the depth-to-width" + }, + { + "start": 3280.0, + 
"duration": 0.0, + "text": "sort of sweep the depth-to-width tradeoffs,<00:54:40.480> you<00:54:40.600> find<00:54:41.359> that<00:54:41.560> really,<00:54:42.440> um<00:54:42.760> the" + }, + { + "start": 3282.87, + "duration": 0.0, + "text": "tradeoffs, you find that really, um the" + }, + { + "start": 3282.88, + "duration": 0.0, + "text": "tradeoffs, you find that really, um the only<00:54:43.160> thing<00:54:43.359> that<00:54:43.520> matters<00:54:43.880> in<00:54:43.960> some<00:54:44.120> sense<00:54:44.359> is" + }, + { + "start": 3284.51, + "duration": 0.0, + "text": "only thing that matters in some sense is" + }, + { + "start": 3284.52, + "duration": 0.0, + "text": "only thing that matters in some sense is FLOPs.<00:54:44.920> As<00:54:45.040> you<00:54:45.120> increase<00:54:45.480> the<00:54:45.560> FLOPs,<00:54:45.960> the" + }, + { + "start": 3286.03, + "duration": 0.0, + "text": "FLOPs. As you increase the FLOPs, the" + }, + { + "start": 3286.04, + "duration": 0.0, + "text": "FLOPs. As you increase the FLOPs, the models<00:54:46.400> get<00:54:46.560> better,<00:54:47.200> and<00:54:47.280> that's<00:54:47.480> really" + }, + { + "start": 3287.75, + "duration": 0.0, + "text": "models get better, and that's really" + }, + { + "start": 3287.76, + "duration": 0.0, + "text": "models get better, and that's really controlling<00:54:48.520> the<00:54:48.600> majority<00:54:49.120> of<00:54:49.200> the<00:54:49.280> effects," + }, + { + "start": 3289.63, + "duration": 0.0, + "text": "controlling the majority of the effects," + }, + { + "start": 3289.64, + "duration": 0.0, + "text": "controlling the majority of the effects, not<00:54:49.840> necessarily<00:54:51.000> uh<00:54:51.080> the<00:54:51.240> aspect<00:54:51.720> ratio.<00:54:52.240> And" + }, + { + "start": 3292.39, + "duration": 0.0, + "text": "not necessarily uh the aspect ratio. And" + }, + { + "start": 3292.4, + "duration": 0.0, + "text": "not necessarily uh the aspect ratio. 
And so,<00:54:53.080> I<00:54:53.120> think<00:54:53.320> what<00:54:53.440> has<00:54:53.560> really<00:54:53.840> emerged<00:54:54.320> from" + }, + { + "start": 3294.47, + "duration": 0.0, + "text": "so, I think what has really emerged from" + }, + { + "start": 3294.48, + "duration": 0.0, + "text": "so, I think what has really emerged from this<00:54:54.760> is<00:54:54.880> the<00:54:55.000> sense<00:54:55.280> that<00:54:55.840> there's<00:54:56.040> a<00:54:56.120> general" + }, + { + "start": 3296.59, + "duration": 0.0, + "text": "this is the sense that there's a general" + }, + { + "start": 3296.6, + "duration": 0.0, + "text": "this is the sense that there's a general forgiving<00:54:57.120> band<00:54:57.440> of<00:54:57.560> hyperparameters<00:54:58.240> that" + }, + { + "start": 3298.39, + "duration": 0.0, + "text": "forgiving band of hyperparameters that" + }, + { + "start": 3298.4, + "duration": 0.0, + "text": "forgiving band of hyperparameters that people<00:54:58.680> tend<00:54:58.920> to<00:54:59.040> choose,<00:54:59.800> and<00:54:59.920> then<00:55:00.080> you" + }, + { + "start": 3300.19, + "duration": 0.0, + "text": "people tend to choose, and then you" + }, + { + "start": 3300.2, + "duration": 0.0, + "text": "people tend to choose, and then you really<00:55:00.520> worry<00:55:00.800> about<00:55:01.200> primarily<00:55:01.800> your" + }, + { + "start": 3301.91, + "duration": 0.0, + "text": "really worry about primarily your" + }, + { + "start": 3301.92, + "duration": 0.0, + "text": "really worry about primarily your systems<00:55:02.400> utilization<00:55:03.520> rather<00:55:03.840> than<00:55:04.160> sort<00:55:04.280> of" + }, + { + "start": 3304.349, + "duration": 0.0, + "text": "systems utilization rather than sort of" + }, + { + "start": 3304.359, + "duration": 0.0, + "text": "systems utilization rather than sort of expressiveness<00:55:05.080> concerns,<00:55:05.560> which<00:55:05.760> are<00:55:05.840> hard" + }, + { + "start": 
3306.07, + "duration": 0.0, + "text": "expressiveness concerns, which are hard" + }, + { + "start": 3306.08, + "duration": 0.0, + "text": "expressiveness concerns, which are hard to<00:55:06.200> reason<00:55:06.440> about." + }, + { + "start": 3308.47, + "duration": 0.0, + "text": "to reason about." + }, + { + "start": 3308.48, + "duration": 0.0, + "text": "to reason about. Cool." + }, + { + "start": 3309.79, + "duration": 0.0, + "text": "Cool." + }, + { + "start": 3309.8, + "duration": 0.0, + "text": "Cool. Um<00:55:10.720> okay.<00:55:11.440> And<00:55:11.560> then<00:55:11.680> maybe<00:55:11.880> the<00:55:12.080> last" + }, + { + "start": 3312.51, + "duration": 0.0, + "text": "Um okay. And then maybe the last" + }, + { + "start": 3312.52, + "duration": 0.0, + "text": "Um okay. And then maybe the last hyperparameter<00:55:13.200> thing<00:55:14.000> uh<00:55:14.120> that<00:55:14.280> I<00:55:14.359> want<00:55:14.520> to" + }, + { + "start": 3314.59, + "duration": 0.0, + "text": "hyperparameter thing uh that I want to" + }, + { + "start": 3314.6, + "duration": 0.0, + "text": "hyperparameter thing uh that I want to mention<00:55:15.320> is<00:55:15.640> vocabulary<00:55:16.480> sizes.<00:55:17.520> Um" + }, + { + "start": 3318.15, + "duration": 0.0, + "text": "mention is vocabulary sizes. Um" + }, + { + "start": 3318.16, + "duration": 0.0, + "text": "mention is vocabulary sizes. 
Um and<00:55:18.280> this<00:55:18.440> one's<00:55:19.160> kind<00:55:19.359> of<00:55:19.440> interesting<00:55:19.920> to<00:55:20.040> me" + }, + { + "start": 3320.19, + "duration": 0.0, + "text": "and this one's kind of interesting to me" + }, + { + "start": 3320.2, + "duration": 0.0, + "text": "and this one's kind of interesting to me because<00:55:20.440> there's<00:55:20.600> a<00:55:20.680> really<00:55:21.000> clear" + }, + { + "start": 3321.349, + "duration": 0.0, + "text": "because there's a really clear" + }, + { + "start": 3321.359, + "duration": 0.0, + "text": "because there's a really clear difference<00:55:21.920> between<00:55:22.320> two<00:55:22.560> classes<00:55:23.040> of" + }, + { + "start": 3323.19, + "duration": 0.0, + "text": "difference between two classes of" + }, + { + "start": 3323.2, + "duration": 0.0, + "text": "difference between two classes of models.<00:55:24.240> Um<00:55:24.600> I<00:55:24.680> think<00:55:24.880> in<00:55:25.000> the<00:55:25.120> early<00:55:25.480> days<00:55:26.200> of" + }, + { + "start": 3326.349, + "duration": 0.0, + "text": "models. Um I think in the early days of" + }, + { + "start": 3326.359, + "duration": 0.0, + "text": "models. 
Um I think in the early days of a<00:55:26.400> lot<00:55:26.720> of,<00:55:26.880> you<00:55:26.960> know,<00:55:27.560> um" + }, + { + "start": 3328.27, + "duration": 0.0, + "text": "a lot of, you know, um" + }, + { + "start": 3328.28, + "duration": 0.0, + "text": "a lot of, you know, um uh<00:55:28.359> early<00:55:28.600> days<00:55:29.480> of<00:55:30.040> open-source<00:55:30.680> model" + }, + { + "start": 3330.95, + "duration": 0.0, + "text": "uh early days of open-source model" + }, + { + "start": 3330.96, + "duration": 0.0, + "text": "uh early days of open-source model training,<00:55:31.800> um" + }, + { + "start": 3332.59, + "duration": 0.0, + "text": "training, um" + }, + { + "start": 3332.6, + "duration": 0.0, + "text": "training, um there<00:55:32.720> were<00:55:32.800> a<00:55:32.840> lot<00:55:33.040> of<00:55:33.120> monolingual<00:55:33.640> models" + }, + { + "start": 3334.15, + "duration": 0.0, + "text": "there were a lot of monolingual models" + }, + { + "start": 3334.16, + "duration": 0.0, + "text": "there were a lot of monolingual models whose<00:55:34.520> only<00:55:34.840> goal<00:55:35.120> was<00:55:35.320> to<00:55:35.440> be<00:55:35.560> good<00:55:35.760> on" + }, + { + "start": 3335.87, + "duration": 0.0, + "text": "whose only goal was to be good on" + }, + { + "start": 3335.88, + "duration": 0.0, + "text": "whose only goal was to be good on English.<00:55:36.760> And<00:55:36.800> for<00:55:36.920> those<00:55:37.160> models,<00:55:37.760> you<00:55:37.880> had" + }, + { + "start": 3338.03, + "duration": 0.0, + "text": "English. And for those models, you had" + }, + { + "start": 3338.04, + "duration": 0.0, + "text": "English. 
And for those models, you had these<00:55:38.160> like<00:55:38.320> much<00:55:38.720> smaller<00:55:39.359> vocab<00:55:39.800> size,<00:55:40.080> in" + }, + { + "start": 3340.15, + "duration": 0.0, + "text": "these like much smaller vocab size, in" + }, + { + "start": 3340.16, + "duration": 0.0, + "text": "these like much smaller vocab size, in the<00:55:40.240> 30,000<00:55:40.920> range.<00:55:41.800> Um<00:55:42.400> and<00:55:42.560> then," + }, + { + "start": 3343.15, + "duration": 0.0, + "text": "the 30,000 range. Um and then," + }, + { + "start": 3343.16, + "duration": 0.0, + "text": "the 30,000 range. Um and then, post-LLaMA,<00:55:44.160> a<00:55:44.200> lot<00:55:44.440> of<00:55:44.560> people<00:55:45.240> were<00:55:45.359> really" + }, + { + "start": 3345.59, + "duration": 0.0, + "text": "post-LLaMA, a lot of people were really" + }, + { + "start": 3345.6, + "duration": 0.0, + "text": "post-LLaMA, a lot of people were really interested<00:55:46.040> in<00:55:46.120> multilingual<00:55:46.880> or<00:55:46.960> like" + }, + { + "start": 3347.11, + "duration": 0.0, + "text": "interested in multilingual or like" + }, + { + "start": 3347.12, + "duration": 0.0, + "text": "interested in multilingual or like production<00:55:47.520> systems.<00:55:47.880> So,<00:55:48.080> these<00:55:48.280> include" + }, + { + "start": 3348.63, + "duration": 0.0, + "text": "production systems. So, these include" + }, + { + "start": 3348.64, + "duration": 0.0, + "text": "production systems. So, these include closed-source<00:55:49.080> models<00:55:49.359> like<00:55:49.480> GPT-4.<00:55:50.680> Um<00:55:51.040> all" + }, + { + "start": 3351.19, + "duration": 0.0, + "text": "closed-source models like GPT-4. Um all" + }, + { + "start": 3351.2, + "duration": 0.0, + "text": "closed-source models like GPT-4. 
Um all these<00:55:51.400> have<00:55:51.560> much<00:55:51.880> much<00:55:52.120> larger<00:55:52.440> vocab<00:55:52.880> sizes," + }, + { + "start": 3353.27, + "duration": 0.0, + "text": "these have much much larger vocab sizes," + }, + { + "start": 3353.28, + "duration": 0.0, + "text": "these have much much larger vocab sizes, and<00:55:53.400> these<00:55:53.560> are<00:55:53.760> roughly<00:55:54.280> in<00:55:54.359> the<00:55:54.440> hundred<00:55:54.880> to" + }, + { + "start": 3354.99, + "duration": 0.0, + "text": "and these are roughly in the hundred to" + }, + { + "start": 3355.0, + "duration": 0.0, + "text": "and these are roughly in the hundred to 200,000" + }, + { + "start": 3356.47, + "duration": 0.0, + "text": "200,000" + }, + { + "start": 3356.48, + "duration": 0.0, + "text": "200,000 um<00:55:56.800> vocab<00:55:57.240> range." + }, + { + "start": 3358.55, + "duration": 0.0, + "text": "um vocab range." + }, + { + "start": 3358.56, + "duration": 0.0, + "text": "um vocab range. And<00:55:58.640> you<00:55:58.720> see<00:55:58.880> generally<00:55:59.280> that,<00:55:59.480> you<00:55:59.520> know," + }, + { + "start": 3359.63, + "duration": 0.0, + "text": "And you see generally that, you know," + }, + { + "start": 3359.64, + "duration": 0.0, + "text": "And you see generally that, you know, Google<00:55:59.920> models<00:56:00.200> have<00:56:00.320> a<00:56:00.400> ton<00:56:00.640> more<00:56:00.800> vocab.<00:56:01.760> Um" + }, + { + "start": 3362.15, + "duration": 0.0, + "text": "Google models have a ton more vocab. Um" + }, + { + "start": 3362.16, + "duration": 0.0, + "text": "Google models have a ton more vocab. 
Um LLaMA<00:56:02.520> derivatives<00:56:03.120> roughly<00:56:03.480> range<00:56:03.760> at<00:56:03.840> about" + }, + { + "start": 3363.99, + "duration": 0.0, + "text": "LLaMA derivatives roughly range at about" + }, + { + "start": 3364.0, + "duration": 0.0, + "text": "LLaMA derivatives roughly range at about a<00:56:04.040> hundred<00:56:05.040> uh<00:56:05.160> thousand<00:56:05.560> tokens,<00:56:06.440> and<00:56:06.600> then" + }, + { + "start": 3366.91, + "duration": 0.0, + "text": "a hundred uh thousand tokens, and then" + }, + { + "start": 3366.92, + "duration": 0.0, + "text": "a hundred uh thousand tokens, and then the<00:56:07.320> the<00:56:07.400> sort<00:56:07.520> of<00:56:07.600> monolingual<00:56:08.080> models<00:56:08.320> are" + }, + { + "start": 3368.43, + "duration": 0.0, + "text": "the the sort of monolingual models are" + }, + { + "start": 3368.44, + "duration": 0.0, + "text": "the the sort of monolingual models are about<00:56:08.720> 30,000." + }, + { + "start": 3370.47, + "duration": 0.0, + "text": "about 30,000." + }, + { + "start": 3370.48, + "duration": 0.0, + "text": "about 30,000. Um<00:56:10.960> this<00:56:11.120> is<00:56:11.359> somewhat<00:56:11.720> clear.<00:56:11.960> The" + }, + { + "start": 3372.07, + "duration": 0.0, + "text": "Um this is somewhat clear. The" + }, + { + "start": 3372.08, + "duration": 0.0, + "text": "Um this is somewhat clear. The multilingual<00:56:12.680> models<00:56:13.000> really<00:56:13.240> do<00:56:13.400> need<00:56:13.600> much" + }, + { + "start": 3373.79, + "duration": 0.0, + "text": "multilingual models really do need much" + }, + { + "start": 3373.8, + "duration": 0.0, + "text": "multilingual models really do need much larger<00:56:14.120> vocab<00:56:14.600> to<00:56:14.680> cover<00:56:14.960> the<00:56:15.080> whole<00:56:15.280> space." + }, + { + "start": 3376.07, + "duration": 0.0, + "text": "larger vocab to cover the whole space." 
+ }, + { + "start": 3376.08, + "duration": 0.0, + "text": "larger vocab to cover the whole space. Generally,<00:56:16.560> the<00:56:16.640> models<00:56:16.960> on<00:56:17.040> the<00:56:17.160> right<00:56:17.320> are" + }, + { + "start": 3377.43, + "duration": 0.0, + "text": "Generally, the models on the right are" + }, + { + "start": 3377.44, + "duration": 0.0, + "text": "Generally, the models on the right are also<00:56:17.680> bigger.<00:56:18.080> There<00:56:18.280> have<00:56:18.400> been<00:56:18.520> scaling<00:56:18.960> law" + }, + { + "start": 3379.11, + "duration": 0.0, + "text": "also bigger. There have been scaling law" + }, + { + "start": 3379.12, + "duration": 0.0, + "text": "also bigger. There have been scaling law studies<00:56:19.520> showing<00:56:19.840> that<00:56:20.000> the<00:56:20.080> bigger<00:56:20.400> your" + }, + { + "start": 3380.51, + "duration": 0.0, + "text": "studies showing that the bigger your" + }, + { + "start": 3380.52, + "duration": 0.0, + "text": "studies showing that the bigger your model,<00:56:21.280> the<00:56:21.520> larger<00:56:22.000> the<00:56:22.120> vocab<00:56:22.480> it<00:56:22.600> can" + }, + { + "start": 3382.75, + "duration": 0.0, + "text": "model, the larger the vocab it can" + }, + { + "start": 3382.76, + "duration": 0.0, + "text": "model, the larger the vocab it can handle,<00:56:23.440> and<00:56:23.560> so<00:56:23.640> this<00:56:23.800> is<00:56:23.920> also<00:56:24.280> partially" + }, + { + "start": 3384.75, + "duration": 0.0, + "text": "handle, and so this is also partially" + }, + { + "start": 3384.76, + "duration": 0.0, + "text": "handle, and so this is also partially driven<00:56:25.160> by<00:56:25.920> uh<00:56:26.000> modern<00:56:26.359> scaling<00:56:26.720> trends," + }, + { + "start": 3387.03, + "duration": 0.0, + "text": "driven by uh modern scaling trends," + }, + { + "start": 3387.04, + "duration": 0.0, + "text": "driven by uh modern scaling trends, where<00:56:27.120> 
the<00:56:27.200> models<00:56:27.520> on<00:56:27.600> the<00:56:27.720> right<00:56:27.920> are" + }, + { + "start": 3387.99, + "duration": 0.0, + "text": "where the models on the right are" + }, + { + "start": 3388.0, + "duration": 0.0, + "text": "where the models on the right are generally<00:56:28.480> bigger.<00:56:28.760> No<00:56:28.880> one's<00:56:29.080> training" + }, + { + "start": 3389.43, + "duration": 0.0, + "text": "generally bigger. No one's training" + }, + { + "start": 3389.44, + "duration": 0.0, + "text": "generally bigger. No one's training small<00:56:30.120> uh<00:56:30.200> sorry,<00:56:30.400> large<00:56:30.760> monolingual<00:56:31.320> models" + }, + { + "start": 3392.15, + "duration": 0.0, + "text": "small uh sorry, large monolingual models" + }, + { + "start": 3392.16, + "duration": 0.0, + "text": "small uh sorry, large monolingual models uh<00:56:32.280> anymore." + }, + { + "start": 3395.56, + "duration": 0.0, + "text": "Okay.<00:56:36.120> So,<00:56:36.640> um<00:56:37.120> yeah." + }, + { + "start": 3408.92, + "duration": 0.0, + "text": "Sorry,<00:56:49.160> uh<00:56:49.280> the<00:56:49.400> question<00:56:49.640> was<00:56:49.760> like,<00:56:50.280> if<00:56:50.440> you" + }, + { + "start": 3410.55, + "duration": 0.0, + "text": "Sorry, uh the question was like, if you" + }, + { + "start": 3410.56, + "duration": 0.0, + "text": "Sorry, uh the question was like, if you have<00:56:51.520> Sorry,<00:56:51.840> multilingual<00:56:52.280> models<00:56:52.600> or" + }, + { + "start": 3412.71, + "duration": 0.0, + "text": "have Sorry, multilingual models or" + }, + { + "start": 3412.72, + "duration": 0.0, + "text": "have Sorry, multilingual models or sorry?<00:56:52.960> Multimodal.<00:56:53.680> Multimodal.<00:56:54.640> Yeah,<00:56:54.800> so" + }, + { + "start": 3415.43, + "duration": 0.0, + "text": "sorry? Multimodal. Multimodal. Yeah, so" + }, + { + "start": 3415.44, + "duration": 0.0, + "text": "sorry? Multimodal. Multimodal. 
Yeah, so I<00:56:55.520> guess<00:56:55.760> it<00:56:55.880> depends<00:56:56.480> on<00:56:57.080> the<00:56:57.320> way<00:56:57.520> that<00:56:57.640> your" + }, + { + "start": 3417.71, + "duration": 0.0, + "text": "I guess it depends on the way that your" + }, + { + "start": 3417.72, + "duration": 0.0, + "text": "I guess it depends on the way that your tokens<00:56:57.960> are<00:56:58.040> encoded,<00:56:58.560> but,<00:56:58.960> you<00:56:59.040> know,<00:56:59.160> if" + }, + { + "start": 3419.27, + "duration": 0.0, + "text": "tokens are encoded, but, you know, if" + }, + { + "start": 3419.28, + "duration": 0.0, + "text": "tokens are encoded, but, you know, if you're<00:56:59.359> tokenizing<00:56:59.840> your<00:56:59.960> images<00:57:00.280> and<00:57:00.359> things" + }, + { + "start": 3420.55, + "duration": 0.0, + "text": "you're tokenizing your images and things" + }, + { + "start": 3420.56, + "duration": 0.0, + "text": "you're tokenizing your images and things like<00:57:00.720> that,<00:57:00.920> then<00:57:01.080> you<00:57:01.160> need<00:57:01.320> to,<00:57:01.760> you<00:57:01.840> know," + }, + { + "start": 3421.91, + "duration": 0.0, + "text": "like that, then you need to, you know," + }, + { + "start": 3421.92, + "duration": 0.0, + "text": "like that, then you need to, you know, have<00:57:02.240> many<00:57:02.480> more<00:57:02.640> tokens<00:57:03.000> to<00:57:03.080> account<00:57:03.320> for" + }, + { + "start": 3423.43, + "duration": 0.0, + "text": "have many more tokens to account for" + }, + { + "start": 3423.44, + "duration": 0.0, + "text": "have many more tokens to account for those.<00:57:04.120> Um<00:57:04.359> if<00:57:04.480> you<00:57:04.560> look<00:57:04.720> at<00:57:04.880> like<00:57:05.040> various" + }, + { + "start": 3425.47, + "duration": 0.0, + "text": "those. Um if you look at like various" + }, + { + "start": 3425.48, + "duration": 0.0, + "text": "those. 
Um if you look at like various open-source<00:57:05.960> releases,<00:57:06.440> they'll<00:57:06.560> have<00:57:06.720> like" + }, + { + "start": 3426.83, + "duration": 0.0, + "text": "open-source releases, they'll have like" + }, + { + "start": 3426.84, + "duration": 0.0, + "text": "open-source releases, they'll have like a<00:57:06.880> different<00:57:07.200> image<00:57:07.400> tokenizer<00:57:07.880> with<00:57:08.000> its<00:57:08.120> own" + }, + { + "start": 3428.31, + "duration": 0.0, + "text": "a different image tokenizer with its own" + }, + { + "start": 3428.32, + "duration": 0.0, + "text": "a different image tokenizer with its own vocab,<00:57:08.720> which<00:57:08.840> is<00:57:09.000> quite<00:57:09.240> large." + }, + { + "start": 3430.75, + "duration": 0.0, + "text": "vocab, which is quite large." + }, + { + "start": 3430.76, + "duration": 0.0, + "text": "vocab, which is quite large. Yeah." + }, + { + "start": 3431.63, + "duration": 0.0, + "text": "Yeah." + }, + { + "start": 3431.64, + "duration": 0.0, + "text": "Yeah. Uh<00:57:11.960> how<00:57:12.120> valid<00:57:12.400> is<00:57:12.520> it<00:57:12.640> to" + }, + { + "start": 3433.87, + "duration": 0.0, + "text": "Uh how valid is it to" + }, + { + "start": 3433.88, + "duration": 0.0, + "text": "Uh how valid is it to compare" + }, + { + "start": 3435.15, + "duration": 0.0, + "text": "compare" + }, + { + "start": 3435.16, + "duration": 0.0, + "text": "compare um" + }, + { + "start": 3436.47, + "duration": 0.0, + "text": "um" + }, + { + "start": 3436.48, + "duration": 0.0, + "text": "um bits<00:57:17.080> uh<00:57:17.120> bits<00:57:17.359> per<00:57:17.480> byte<00:57:17.920> for<00:57:18.080> different<00:57:18.920> for" + }, + { + "start": 3439.03, + "duration": 0.0, + "text": "bits uh bits per byte for different for" + }, + { + "start": 3439.04, + "duration": 0.0, + "text": "bits uh bits per byte for different for different<00:57:19.280> tokenizers?" 
+ }, + { + "start": 3440.79, + "duration": 0.0, + "text": "different tokenizers?" + }, + { + "start": 3440.8, + "duration": 0.0, + "text": "different tokenizers? Um" + }, + { + "start": 3441.75, + "duration": 0.0, + "text": "Um" + }, + { + "start": 3441.76, + "duration": 0.0, + "text": "Um how<00:57:21.960> valid<00:57:22.359> is<00:57:22.480> it<00:57:22.560> to<00:57:22.640> compare<00:57:22.960> bits<00:57:23.160> Oh,<00:57:23.320> that" + }, + { + "start": 3443.43, + "duration": 0.0, + "text": "how valid is it to compare bits Oh, that" + }, + { + "start": 3443.44, + "duration": 0.0, + "text": "how valid is it to compare bits Oh, that is<00:57:23.520> a<00:57:23.600> great<00:57:23.800> question.<00:57:24.080> Okay,<00:57:24.440> yeah.<00:57:25.000> Uh" + }, + { + "start": 3445.15, + "duration": 0.0, + "text": "is a great question. Okay, yeah. Uh" + }, + { + "start": 3445.16, + "duration": 0.0, + "text": "is a great question. Okay, yeah. Uh that's<00:57:25.359> not<00:57:25.520> a<00:57:25.600> hyperparameter<00:57:26.120> question," + }, + { + "start": 3446.31, + "duration": 0.0, + "text": "that's not a hyperparameter question," + }, + { + "start": 3446.32, + "duration": 0.0, + "text": "that's not a hyperparameter question, but<00:57:26.400> that<00:57:26.480> is<00:57:26.520> a<00:57:26.560> good<00:57:26.720> question.<00:57:27.400> Um<00:57:27.720> so," + }, + { + "start": 3448.79, + "duration": 0.0, + "text": "but that is a good question. Um so," + }, + { + "start": 3448.8, + "duration": 0.0, + "text": "but that is a good question. Um so, uh" + }, + { + "start": 3449.51, + "duration": 0.0, + "text": "uh" + }, + { + "start": 3449.52, + "duration": 0.0, + "text": "uh what<00:57:29.680> is<00:57:29.760> the<00:57:29.880> right<00:57:30.120> way?<00:57:30.320> Okay,<00:57:30.560> so<00:57:30.720> so,<00:57:30.840> let" + }, + { + "start": 3450.95, + "duration": 0.0, + "text": "what is the right way? 
Okay, so so, let" + }, + { + "start": 3450.96, + "duration": 0.0, + "text": "what is the right way? Okay, so so, let me<00:57:31.160> let<00:57:31.320> me<00:57:31.400> like" + }, + { + "start": 3452.31, + "duration": 0.0, + "text": "me let me like" + }, + { + "start": 3452.32, + "duration": 0.0, + "text": "me let me like step<00:57:32.560> back<00:57:32.760> a<00:57:32.800> moment<00:57:33.160> and<00:57:33.240> like<00:57:33.400> put<00:57:33.520> us<00:57:33.600> in" + }, + { + "start": 3453.67, + "duration": 0.0, + "text": "step back a moment and like put us in" + }, + { + "start": 3453.68, + "duration": 0.0, + "text": "step back a moment and like put us in the<00:57:33.760> right<00:57:34.240> mindset.<00:57:35.200> So,<00:57:36.040> if<00:57:36.120> we<00:57:36.240> think<00:57:36.440> about" + }, + { + "start": 3456.79, + "duration": 0.0, + "text": "the right mindset. So, if we think about" + }, + { + "start": 3456.8, + "duration": 0.0, + "text": "the right mindset. So, if we think about language<00:57:37.080> modeling,<00:57:37.600> language<00:57:37.960> modeling<00:57:38.400> is" + }, + { + "start": 3458.59, + "duration": 0.0, + "text": "language modeling, language modeling is" + }, + { + "start": 3458.6, + "duration": 0.0, + "text": "language modeling, language modeling is is<00:57:38.720> a<00:57:38.800> generative<00:57:39.240> modeling<00:57:39.600> task,<00:57:40.000> right?<00:57:40.120> We" + }, + { + "start": 3460.15, + "duration": 0.0, + "text": "is a generative modeling task, right? We" + }, + { + "start": 3460.16, + "duration": 0.0, + "text": "is a generative modeling task, right? We are<00:57:40.240> modeling<00:57:40.640> the<00:57:40.720> probability<00:57:41.240> of<00:57:41.320> a" + }, + { + "start": 3461.349, + "duration": 0.0, + "text": "are modeling the probability of a" + }, + { + "start": 3461.359, + "duration": 0.0, + "text": "are modeling the probability of a sequence." 
+ }, + { + "start": 3462.43, + "duration": 0.0, + "text": "sequence." + }, + { + "start": 3462.44, + "duration": 0.0, + "text": "sequence. Now,<00:57:42.720> as<00:57:42.840> long<00:57:43.040> as<00:57:43.120> your<00:57:43.240> sequence<00:57:43.640> is<00:57:43.840> fixed," + }, + { + "start": 3464.59, + "duration": 0.0, + "text": "Now, as long as your sequence is fixed," + }, + { + "start": 3464.6, + "duration": 0.0, + "text": "Now, as long as your sequence is fixed, right?<00:57:44.800> It's<00:57:44.920> the<00:57:45.040> same.<00:57:45.320> You<00:57:45.359> have<00:57:45.720> you<00:57:45.840> know," + }, + { + "start": 3465.95, + "duration": 0.0, + "text": "right? It's the same. You have you know," + }, + { + "start": 3465.96, + "duration": 0.0, + "text": "right? It's the same. You have you know, adulterated<00:57:46.600> it<00:57:46.680> anyway,<00:57:47.520> and<00:57:47.640> you<00:57:47.720> provide<00:57:48.120> a" + }, + { + "start": 3468.19, + "duration": 0.0, + "text": "adulterated it anyway, and you provide a" + }, + { + "start": 3468.2, + "duration": 0.0, + "text": "adulterated it anyway, and you provide a probability<00:57:48.880> over<00:57:49.080> all<00:57:49.240> strings,<00:57:50.040> that's" + }, + { + "start": 3470.27, + "duration": 0.0, + "text": "probability over all strings, that's" + }, + { + "start": 3470.28, + "duration": 0.0, + "text": "probability over all strings, that's always<00:57:50.600> valid<00:57:50.920> to<00:57:51.000> compare,<00:57:51.640> right?<00:57:51.840> At<00:57:51.960> that" + }, + { + "start": 3472.23, + "duration": 0.0, + "text": "always valid to compare, right? At that" + }, + { + "start": 3472.24, + "duration": 0.0, + "text": "always valid to compare, right? At that level<00:57:52.520> of<00:57:52.600> things,<00:57:52.840> it's<00:57:52.960> always<00:57:53.200> valid." + }, + { + "start": 3474.19, + "duration": 0.0, + "text": "level of things, it's always valid." 
+ }, + { + "start": 3474.2, + "duration": 0.0, + "text": "level of things, it's always valid. Now,<00:57:54.480> when<00:57:54.640> you<00:57:54.760> ask<00:57:55.000> the<00:57:55.080> question,<00:57:55.880> is<00:57:56.040> it" + }, + { + "start": 3476.15, + "duration": 0.0, + "text": "Now, when you ask the question, is it" + }, + { + "start": 3476.16, + "duration": 0.0, + "text": "Now, when you ask the question, is it valid<00:57:56.440> to<00:57:56.560> compare<00:57:57.040> the<00:57:57.160> bits<00:57:57.440> per<00:57:57.600> byte<00:57:58.320> of<00:57:59.040> uh" + }, + { + "start": 3479.11, + "duration": 0.0, + "text": "valid to compare the bits per byte of uh" + }, + { + "start": 3479.12, + "duration": 0.0, + "text": "valid to compare the bits per byte of uh arbitrary<00:57:59.600> token<00:58:00.040> or<00:58:00.400> or<00:58:00.520> two<00:58:00.720> arbitrary" + }, + { + "start": 3480.99, + "duration": 0.0, + "text": "arbitrary token or or two arbitrary" + }, + { + "start": 3481.0, + "duration": 0.0, + "text": "arbitrary token or or two arbitrary tokenizers?" + }, + { + "start": 3482.67, + "duration": 0.0, + "text": "tokenizers?" + }, + { + "start": 3482.68, + "duration": 0.0, + "text": "tokenizers? Really,<00:58:03.280> there's<00:58:03.560> two<00:58:03.720> things<00:58:03.920> at<00:58:04.080> play.<00:58:04.320> The" + }, + { + "start": 3484.43, + "duration": 0.0, + "text": "Really, there's two things at play. The" + }, + { + "start": 3484.44, + "duration": 0.0, + "text": "Really, there's two things at play. 
The one<00:58:04.600> thing<00:58:04.760> is,<00:58:05.320> you<00:58:05.400> know,<00:58:05.560> did<00:58:05.760> you<00:58:05.960> touch" + }, + { + "start": 3486.27, + "duration": 0.0, + "text": "one thing is, you know, did you touch" + }, + { + "start": 3486.28, + "duration": 0.0, + "text": "one thing is, you know, did you touch the<00:58:06.400> sequence<00:58:06.760> at<00:58:06.880> all?<00:58:07.160> Like,<00:58:07.280> if<00:58:07.400> you<00:58:07.520> look" + }, + { + "start": 3487.71, + "duration": 0.0, + "text": "the sequence at all? Like, if you look" + }, + { + "start": 3487.72, + "duration": 0.0, + "text": "the sequence at all? Like, if you look at<00:58:08.000> some<00:58:08.240> tokenizers<00:58:08.840> in<00:58:08.920> the<00:58:09.000> past,<00:58:09.440> before" + }, + { + "start": 3489.79, + "duration": 0.0, + "text": "at some tokenizers in the past, before" + }, + { + "start": 3489.8, + "duration": 0.0, + "text": "at some tokenizers in the past, before subword<00:58:10.160> tokenizers,<00:58:11.160> they<00:58:11.280> would<00:58:11.440> drop<00:58:11.720> some" + }, + { + "start": 3491.87, + "duration": 0.0, + "text": "subword tokenizers, they would drop some" + }, + { + "start": 3491.88, + "duration": 0.0, + "text": "subword tokenizers, they would drop some tokens<00:58:12.280> or<00:58:12.359> drop<00:58:12.600> some<00:58:12.760> words.<00:58:13.440> That<00:58:13.640> changes" + }, + { + "start": 3494.19, + "duration": 0.0, + "text": "tokens or drop some words. That changes" + }, + { + "start": 3494.2, + "duration": 0.0, + "text": "tokens or drop some words. That changes that<00:58:14.400> makes<00:58:14.640> the<00:58:14.720> comparisons<00:58:15.200> invalid.<00:58:16.040> But" + }, + { + "start": 3496.11, + "duration": 0.0, + "text": "that makes the comparisons invalid. But" + }, + { + "start": 3496.12, + "duration": 0.0, + "text": "that makes the comparisons invalid. 
But modern<00:58:16.440> tokenizers<00:58:16.920> are<00:58:17.000> complete.<00:58:17.520> They<00:58:17.640> can" + }, + { + "start": 3497.75, + "duration": 0.0, + "text": "modern tokenizers are complete. They can" + }, + { + "start": 3497.76, + "duration": 0.0, + "text": "modern tokenizers are complete. They can model<00:58:18.000> any<00:58:18.200> sequence,<00:58:18.600> so<00:58:18.680> that's<00:58:18.920> not<00:58:19.080> a" + }, + { + "start": 3499.11, + "duration": 0.0, + "text": "model any sequence, so that's not a" + }, + { + "start": 3499.12, + "duration": 0.0, + "text": "model any sequence, so that's not a concern.<00:58:20.120> Um<00:58:20.520> the<00:58:20.720> other<00:58:20.920> thing<00:58:21.080> that<00:58:21.240> you" + }, + { + "start": 3501.31, + "duration": 0.0, + "text": "concern. Um the other thing that you" + }, + { + "start": 3501.32, + "duration": 0.0, + "text": "concern. Um the other thing that you have<00:58:21.480> to<00:58:21.560> worry<00:58:21.800> about<00:58:22.359> is,<00:58:22.720> are<00:58:22.840> we<00:58:23.000> like" + }, + { + "start": 3503.39, + "duration": 0.0, + "text": "have to worry about is, are we like" + }, + { + "start": 3503.4, + "duration": 0.0, + "text": "have to worry about is, are we like length<00:58:23.760> normalizing<00:58:24.359> it<00:58:24.480> in<00:58:24.640> any<00:58:24.800> way,<00:58:25.120> right?" + }, + { + "start": 3505.63, + "duration": 0.0, + "text": "length normalizing it in any way, right?" + }, + { + "start": 3505.64, + "duration": 0.0, + "text": "length normalizing it in any way, right? 
But<00:58:25.800> for<00:58:25.920> bits<00:58:26.120> per<00:58:26.280> byte,<00:58:26.480> you're<00:58:26.640> always" + }, + { + "start": 3506.91, + "duration": 0.0, + "text": "But for bits per byte, you're always" + }, + { + "start": 3506.92, + "duration": 0.0, + "text": "But for bits per byte, you're always normalizing<00:58:27.480> with<00:58:27.600> the<00:58:27.680> same<00:58:28.280> number,<00:58:28.600> which" + }, + { + "start": 3508.79, + "duration": 0.0, + "text": "normalizing with the same number, which" + }, + { + "start": 3508.8, + "duration": 0.0, + "text": "normalizing with the same number, which is<00:58:28.880> the<00:58:28.960> number<00:58:29.200> of<00:58:29.320> bytes,<00:58:29.920> and<00:58:30.040> so<00:58:30.120> this<00:58:30.280> is" + }, + { + "start": 3510.43, + "duration": 0.0, + "text": "is the number of bytes, and so this is" + }, + { + "start": 3510.44, + "duration": 0.0, + "text": "is the number of bytes, and so this is always<00:58:30.720> a<00:58:30.760> valid<00:58:31.040> comparison,<00:58:31.640> right?<00:58:31.760> So," + }, + { + "start": 3511.83, + "duration": 0.0, + "text": "always a valid comparison, right? So," + }, + { + "start": 3511.84, + "duration": 0.0, + "text": "always a valid comparison, right? So, that's<00:58:32.000> kind<00:58:32.160> of<00:58:32.240> how<00:58:32.320> to<00:58:32.440> think<00:58:32.600> about,<00:58:33.320> you" + }, + { + "start": 3513.39, + "duration": 0.0, + "text": "that's kind of how to think about, you" + }, + { + "start": 3513.4, + "duration": 0.0, + "text": "that's kind of how to think about, you know,<00:58:33.520> tokenizer<00:58:33.880> comparisons.<00:58:34.680> So,<00:58:34.760> for" + }, + { + "start": 3514.83, + "duration": 0.0, + "text": "know, tokenizer comparisons. So, for" + }, + { + "start": 3514.84, + "duration": 0.0, + "text": "know, tokenizer comparisons. 
So, for example," + }, + { + "start": 3515.83, + "duration": 0.0, + "text": "example," + }, + { + "start": 3515.84, + "duration": 0.0, + "text": "example, uh<00:58:36.120> I<00:58:36.160> think<00:58:36.359> they<00:58:36.440> had<00:58:36.560> the<00:58:36.640> results<00:58:37.560> showing" + }, + { + "start": 3517.87, + "duration": 0.0, + "text": "uh I think they had the results showing" + }, + { + "start": 3517.88, + "duration": 0.0, + "text": "uh I think they had the results showing that" + }, + { + "start": 3519.15, + "duration": 0.0, + "text": "that" + }, + { + "start": 3519.16, + "duration": 0.0, + "text": "that comparing<00:58:39.520> perplexity<00:58:40.240> for<00:58:40.400> fixed" + }, + { + "start": 3520.67, + "duration": 0.0, + "text": "comparing perplexity for fixed" + }, + { + "start": 3520.68, + "duration": 0.0, + "text": "comparing perplexity for fixed tokenizers" + }, + { + "start": 3522.15, + "duration": 0.0, + "text": "tokenizers" + }, + { + "start": 3522.16, + "duration": 0.0, + "text": "tokenizers is<00:58:42.440> is<00:58:42.720> is<00:58:43.000> is<00:58:43.600> um" + }, + { + "start": 3524.79, + "duration": 0.0, + "text": "is is is is um" + }, + { + "start": 3524.8, + "duration": 0.0, + "text": "is is is is um always<00:58:45.000> leads<00:58:45.320> to<00:58:45.560> to<00:58:45.680> better<00:58:45.920> actual" + }, + { + "start": 3526.27, + "duration": 0.0, + "text": "always leads to to better actual" + }, + { + "start": 3526.28, + "duration": 0.0, + "text": "always leads to to better actual performance." + }, + { + "start": 3527.03, + "duration": 0.0, + "text": "performance." + }, + { + "start": 3527.04, + "duration": 0.0, + "text": "performance. On<00:58:47.320> on<00:58:47.480> downstream<00:58:47.880> network<00:58:48.240> tasks." + }, + { + "start": 3529.19, + "duration": 0.0, + "text": "On on downstream network tasks." + }, + { + "start": 3529.2, + "duration": 0.0, + "text": "On on downstream network tasks. 
Is<00:58:49.400> the<00:58:49.480> same<00:58:49.720> thing<00:58:49.920> they<00:58:50.120> were<00:58:50.200> looking<00:58:50.480> for?" + }, + { + "start": 3532.19, + "duration": 0.0, + "text": "Is the same thing they were looking for?" + }, + { + "start": 3532.2, + "duration": 0.0, + "text": "Is the same thing they were looking for? Um<00:58:52.720> perplexity<00:58:53.400> and<00:58:53.520> BPD<00:58:53.920> are<00:58:54.000> kind<00:58:54.160> of<00:58:54.280> dual" + }, + { + "start": 3534.55, + "duration": 0.0, + "text": "Um perplexity and BPD are kind of dual" + }, + { + "start": 3534.56, + "duration": 0.0, + "text": "Um perplexity and BPD are kind of dual to<00:58:54.680> each<00:58:54.840> other,<00:58:55.040> so<00:58:55.280> yes,<00:58:56.240> if<00:58:56.359> that's<00:58:56.480> what" + }, + { + "start": 3536.59, + "duration": 0.0, + "text": "to each other, so yes, if that's what" + }, + { + "start": 3536.6, + "duration": 0.0, + "text": "to each other, so yes, if that's what you're<00:58:56.720> asking." + }, + { + "start": 3538.59, + "duration": 0.0, + "text": "you're asking." + }, + { + "start": 3538.6, + "duration": 0.0, + "text": "you're asking. 
It's<00:58:58.760> only<00:58:59.040> yes<00:58:59.240> and<00:58:59.359> only<00:58:59.640> no,<00:59:00.080> cuz<00:59:00.480> if<00:59:00.680> you're" + }, + { + "start": 3540.79, + "duration": 0.0, + "text": "It's only yes and only no, cuz if you're" + }, + { + "start": 3540.8, + "duration": 0.0, + "text": "It's only yes and only no, cuz if you're comparing" + }, + { + "start": 3544.52, + "duration": 0.0, + "text": "you<00:59:04.600> could<00:59:05.000> two<00:59:05.160> frame<00:59:05.680> compare<00:59:06.000> the" + }, + { + "start": 3546.07, + "duration": 0.0, + "text": "you could two frame compare the" + }, + { + "start": 3546.08, + "duration": 0.0, + "text": "you could two frame compare the perplexity<00:59:06.920> as<00:59:07.040> compared" + }, + { + "start": 3548.19, + "duration": 0.0, + "text": "perplexity as compared" + }, + { + "start": 3548.2, + "duration": 0.0, + "text": "perplexity as compared but<00:59:08.359> you're<00:59:08.520> changing<00:59:09.000> it<00:59:09.200> different<00:59:09.720> splits." + }, + { + "start": 3550.91, + "duration": 0.0, + "text": "but you're changing it different splits." + }, + { + "start": 3550.92, + "duration": 0.0, + "text": "but you're changing it different splits. 
Changing<00:59:11.200> it<00:59:11.320> different<00:59:11.450> [snorts]" + }, + { + "start": 3552.15, + "duration": 0.0, + "text": "Changing it different [snorts]" + }, + { + "start": 3552.16, + "duration": 0.0, + "text": "Changing it different [snorts] Okay,<00:59:12.520> we'll<00:59:12.640> have<00:59:12.760> to<00:59:12.960> we'll<00:59:13.080> have<00:59:13.200> to<00:59:13.280> talk" + }, + { + "start": 3553.47, + "duration": 0.0, + "text": "Okay, we'll have to we'll have to talk" + }, + { + "start": 3553.48, + "duration": 0.0, + "text": "Okay, we'll have to we'll have to talk later<00:59:13.760> cuz<00:59:13.880> I'm<00:59:14.000> not<00:59:14.160> sure<00:59:14.359> I<00:59:14.440> understand<00:59:14.840> the" + }, + { + "start": 3554.91, + "duration": 0.0, + "text": "later cuz I'm not sure I understand the" + }, + { + "start": 3554.92, + "duration": 0.0, + "text": "later cuz I'm not sure I understand the question,<00:59:15.320> but<00:59:15.480> I<00:59:15.520> think<00:59:15.720> that<00:59:16.120> that's<00:59:16.320> an" + }, + { + "start": 3556.43, + "duration": 0.0, + "text": "question, but I think that that's an" + }, + { + "start": 3556.44, + "duration": 0.0, + "text": "question, but I think that that's an interesting<00:59:16.800> set<00:59:16.920> of<00:59:16.960> questions.<00:59:17.680> Okay," + }, + { + "start": 3557.87, + "duration": 0.0, + "text": "interesting set of questions. Okay," + }, + { + "start": 3557.88, + "duration": 0.0, + "text": "interesting set of questions. Okay, good." + }, + { + "start": 3558.79, + "duration": 0.0, + "text": "good." + }, + { + "start": 3558.8, + "duration": 0.0, + "text": "good. Okay." + }, + { + "start": 3560.31, + "duration": 0.0, + "text": "Okay." + }, + { + "start": 3560.32, + "duration": 0.0, + "text": "Okay. All<00:59:20.359> right." + }, + { + "start": 3561.79, + "duration": 0.0, + "text": "All right." + }, + { + "start": 3561.8, + "duration": 0.0, + "text": "All right. 
So,<00:59:22.440> um<00:59:23.000> you<00:59:23.080> know,<00:59:23.200> we're<00:59:23.359> we're<00:59:23.480> going" + }, + { + "start": 3563.75, + "duration": 0.0, + "text": "So, um you know, we're we're going" + }, + { + "start": 3563.76, + "duration": 0.0, + "text": "So, um you know, we're we're going through<00:59:23.960> really<00:59:24.240> the<00:59:25.080> the<00:59:25.359> the<00:59:25.640> low-level" + }, + { + "start": 3566.39, + "duration": 0.0, + "text": "through really the the the low-level" + }, + { + "start": 3566.4, + "duration": 0.0, + "text": "through really the the the low-level lowest<00:59:26.760> levels<00:59:27.080> of<00:59:27.200> details<00:59:27.680> of<00:59:27.800> language" + }, + { + "start": 3568.11, + "duration": 0.0, + "text": "lowest levels of details of language" + }, + { + "start": 3568.12, + "duration": 0.0, + "text": "lowest levels of details of language modeling,<00:59:28.440> which<00:59:28.560> I<00:59:28.600> think<00:59:28.760> has<00:59:28.880> really" + }, + { + "start": 3569.67, + "duration": 0.0, + "text": "modeling, which I think has really" + }, + { + "start": 3569.68, + "duration": 0.0, + "text": "modeling, which I think has really exposed<00:59:30.240> a<00:59:30.280> lot<00:59:30.440> of<00:59:30.520> interesting<00:59:30.960> ideas<00:59:31.520> while" + }, + { + "start": 3571.67, + "duration": 0.0, + "text": "exposed a lot of interesting ideas while" + }, + { + "start": 3571.68, + "duration": 0.0, + "text": "exposed a lot of interesting ideas while we<00:59:31.800> sort<00:59:31.960> of<00:59:32.080> talk<00:59:32.320> through<00:59:32.520> this.<00:59:33.240> And<00:59:33.320> I" + }, + { + "start": 3573.35, + "duration": 0.0, + "text": "we sort of talk through this. And I" + }, + { + "start": 3573.36, + "duration": 0.0, + "text": "we sort of talk through this. 
And I think<00:59:33.600> dropout<00:59:34.120> is<00:59:34.240> one<00:59:34.400> of<00:59:34.480> the<00:59:34.720> end" + }, + { + "start": 3574.91, + "duration": 0.0, + "text": "think dropout is one of the end" + }, + { + "start": 3574.92, + "duration": 0.0, + "text": "think dropout is one of the end regularization,<00:59:36.040> I<00:59:36.080> think<00:59:36.240> is<00:59:36.360> another<00:59:37.160> very" + }, + { + "start": 3577.59, + "duration": 0.0, + "text": "regularization, I think is another very" + }, + { + "start": 3577.6, + "duration": 0.0, + "text": "regularization, I think is another very interesting<00:59:38.080> class<00:59:38.480> of<00:59:38.600> ideas.<00:59:39.480> Also<00:59:39.760> one" + }, + { + "start": 3579.91, + "duration": 0.0, + "text": "interesting class of ideas. Also one" + }, + { + "start": 3579.92, + "duration": 0.0, + "text": "interesting class of ideas. Also one that<00:59:40.040> I<00:59:40.080> think<00:59:40.280> is<00:59:40.360> very<00:59:40.560> counterintuitive" + }, + { + "start": 3581.95, + "duration": 0.0, + "text": "that I think is very counterintuitive" + }, + { + "start": 3581.96, + "duration": 0.0, + "text": "that I think is very counterintuitive from<00:59:42.160> your<00:59:42.280> machine<00:59:42.640> learning<00:59:42.920> 101" + }, + { + "start": 3583.35, + "duration": 0.0, + "text": "from your machine learning 101" + }, + { + "start": 3583.36, + "duration": 0.0, + "text": "from your machine learning 101 intuition." + }, + { + "start": 3584.71, + "duration": 0.0, + "text": "intuition." + }, + { + "start": 3584.72, + "duration": 0.0, + "text": "intuition. 
So,<00:59:45.320> let's<00:59:45.600> uh" + }, + { + "start": 3586.15, + "duration": 0.0, + "text": "So, let's uh" + }, + { + "start": 3586.16, + "duration": 0.0, + "text": "So, let's uh go<00:59:46.360> through<00:59:46.720> what<00:59:46.880> I<00:59:46.920> think<00:59:47.120> is<00:59:47.240> like<00:59:47.440> the<00:59:47.960> the" + }, + { + "start": 3588.07, + "duration": 0.0, + "text": "go through what I think is like the the" + }, + { + "start": 3588.08, + "duration": 0.0, + "text": "go through what I think is like the the standard<00:59:48.640> argument<00:59:49.280> for,<00:59:49.720> you<00:59:49.840> know," + }, + { + "start": 3590.27, + "duration": 0.0, + "text": "standard argument for, you know," + }, + { + "start": 3590.28, + "duration": 0.0, + "text": "standard argument for, you know, regularization.<00:59:51.520> Um<00:59:51.840> well,<00:59:52.160> if<00:59:52.280> I'm<00:59:52.400> doing" + }, + { + "start": 3592.63, + "duration": 0.0, + "text": "regularization. Um well, if I'm doing" + }, + { + "start": 3592.64, + "duration": 0.0, + "text": "regularization. Um well, if I'm doing language<00:59:53.040> modeling,<00:59:53.440> I<00:59:53.480> have<00:59:53.600> a<00:59:53.640> lot<00:59:53.880> of<00:59:53.960> data," + }, + { + "start": 3594.39, + "duration": 0.0, + "text": "language modeling, I have a lot of data," + }, + { + "start": 3594.4, + "duration": 0.0, + "text": "language modeling, I have a lot of data, right?<00:59:54.560> I<00:59:54.600> have<00:59:54.760> more<00:59:55.040> data<00:59:55.320> than<00:59:55.480> I<00:59:55.560> can" + }, + { + "start": 3595.71, + "duration": 0.0, + "text": "right? I have more data than I can" + }, + { + "start": 3595.72, + "duration": 0.0, + "text": "right? I have more data than I can process<00:59:56.200> most<00:59:56.480> of<00:59:56.560> the<00:59:56.680> time,<00:59:57.160> right?<00:59:57.480> Unless" + }, + { + "start": 3597.75, + "duration": 0.0, + "text": "process most of the time, right? 
Unless" + }, + { + "start": 3597.76, + "duration": 0.0, + "text": "process most of the time, right? Unless you're<00:59:57.960> at,<00:59:58.480> you<00:59:58.560> know,<00:59:58.680> Google,<00:59:59.000> maybe<00:59:59.280> even" + }, + { + "start": 3599.51, + "duration": 0.0, + "text": "you're at, you know, Google, maybe even" + }, + { + "start": 3599.52, + "duration": 0.0, + "text": "you're at, you know, Google, maybe even then,<01:00:00.160> there<01:00:00.320> is<01:00:00.480> more<01:00:00.880> internet<01:00:01.320> data<01:00:01.640> than" + }, + { + "start": 3601.75, + "duration": 0.0, + "text": "then, there is more internet data than" + }, + { + "start": 3601.76, + "duration": 0.0, + "text": "then, there is more internet data than there<01:00:01.960> is<01:00:02.120> flops.<01:00:03.000> So,<01:00:03.760> you<01:00:03.880> know,<01:00:03.960> I'm" + }, + { + "start": 3604.07, + "duration": 0.0, + "text": "there is flops. So, you know, I'm" + }, + { + "start": 3604.08, + "duration": 0.0, + "text": "there is flops. So, you know, I'm probably<01:00:04.400> not<01:00:04.600> even<01:00:04.720> going<01:00:04.840> to<01:00:04.880> see<01:00:05.000> the<01:00:05.080> same" + }, + { + "start": 3605.31, + "duration": 0.0, + "text": "probably not even going to see the same" + }, + { + "start": 3605.32, + "duration": 0.0, + "text": "probably not even going to see the same data<01:00:05.520> twice,<01:00:06.240> right?<01:00:07.120> Um<01:00:08.000> so,<01:00:08.280> I'm<01:00:08.480> only<01:00:08.680> going" + }, + { + "start": 3608.79, + "duration": 0.0, + "text": "data twice, right? Um so, I'm only going" + }, + { + "start": 3608.8, + "duration": 0.0, + "text": "data twice, right? 
Um so, I'm only going to<01:00:08.840> do<01:00:08.960> a<01:00:09.040> single<01:00:09.480> pass<01:00:09.760> on<01:00:09.840> a<01:00:09.880> corpus,<01:00:10.360> and" + }, + { + "start": 3610.43, + "duration": 0.0, + "text": "to do a single pass on a corpus, and" + }, + { + "start": 3610.44, + "duration": 0.0, + "text": "to do a single pass on a corpus, and there's<01:00:10.640> very<01:00:11.000> good<01:00:11.560> reasons<01:00:12.040> and<01:00:12.200> arguments" + }, + { + "start": 3612.67, + "duration": 0.0, + "text": "there's very good reasons and arguments" + }, + { + "start": 3612.68, + "duration": 0.0, + "text": "there's very good reasons and arguments to<01:00:12.800> believe<01:00:13.120> that<01:00:13.280> a<01:00:13.320> single<01:00:13.720> pass<01:00:14.040> of<01:00:14.120> SGD<01:00:14.720> or" + }, + { + "start": 3614.87, + "duration": 0.0, + "text": "to believe that a single pass of SGD or" + }, + { + "start": 3614.88, + "duration": 0.0, + "text": "to believe that a single pass of SGD or other<01:00:15.120> optimizers<01:00:16.160> is<01:00:16.320> never<01:00:16.680> really<01:00:16.960> going" + }, + { + "start": 3617.19, + "duration": 0.0, + "text": "other optimizers is never really going" + }, + { + "start": 3617.2, + "duration": 0.0, + "text": "other optimizers is never really going to<01:00:17.320> memorize<01:00:17.840> my<01:00:17.960> data<01:00:18.280> very<01:00:18.560> much,<01:00:18.960> right?" + }, + { + "start": 3619.55, + "duration": 0.0, + "text": "to memorize my data very much, right?" + }, + { + "start": 3619.56, + "duration": 0.0, + "text": "to memorize my data very much, right? 
So,<01:00:19.720> this<01:00:19.920> means<01:00:20.520> overfitting<01:00:21.200> is<01:00:21.320> not<01:00:21.600> really" + }, + { + "start": 3621.91, + "duration": 0.0, + "text": "So, this means overfitting is not really" + }, + { + "start": 3621.92, + "duration": 0.0, + "text": "So, this means overfitting is not really a<01:00:22.000> problem<01:00:22.920> uh<01:00:23.160> almost<01:00:23.560> ever<01:00:23.840> during<01:00:24.360> compute" + }, + { + "start": 3624.75, + "duration": 0.0, + "text": "a problem uh almost ever during compute" + }, + { + "start": 3624.76, + "duration": 0.0, + "text": "a problem uh almost ever during compute constrained<01:00:25.200> language<01:00:25.560> modeling." + }, + { + "start": 3626.91, + "duration": 0.0, + "text": "constrained language modeling." + }, + { + "start": 3626.92, + "duration": 0.0, + "text": "constrained language modeling. Now,<01:00:27.680> you<01:00:27.760> know,<01:00:27.880> some<01:00:28.040> people<01:00:28.280> even<01:00:28.480> actually" + }, + { + "start": 3629.15, + "duration": 0.0, + "text": "Now, you know, some people even actually" + }, + { + "start": 3629.16, + "duration": 0.0, + "text": "Now, you know, some people even actually only<01:00:29.440> look<01:00:29.600> at<01:00:29.720> training<01:00:30.240> loss<01:00:30.680> because<01:00:30.960> they" + }, + { + "start": 3631.07, + "duration": 0.0, + "text": "only look at training loss because they" + }, + { + "start": 3631.08, + "duration": 0.0, + "text": "only look at training loss because they believe<01:00:31.360> so<01:00:31.560> strongly<01:00:32.080> that<01:00:32.280> overfitting" + }, + { + "start": 3632.99, + "duration": 0.0, + "text": "believe so strongly that overfitting" + }, + { + "start": 3633.0, + "duration": 0.0, + "text": "believe so strongly that overfitting doesn't<01:00:33.320> happen<01:00:33.600> in<01:00:33.720> single<01:00:33.960> pass<01:00:34.240> SGD." 
+ }, + { + "start": 3635.35, + "duration": 0.0, + "text": "doesn't happen in single pass SGD." + }, + { + "start": 3635.36, + "duration": 0.0, + "text": "doesn't happen in single pass SGD. Now,<01:00:35.520> given<01:00:35.880> this,<01:00:36.360> you<01:00:36.440> know,<01:00:36.560> you<01:00:36.640> can<01:00:36.760> sort" + }, + { + "start": 3636.91, + "duration": 0.0, + "text": "Now, given this, you know, you can sort" + }, + { + "start": 3636.92, + "duration": 0.0, + "text": "Now, given this, you know, you can sort of<01:00:37.040> sit<01:00:37.280> and<01:00:37.400> think<01:00:37.600> about<01:00:37.840> this.<01:00:38.640> Should<01:00:39.000> I" + }, + { + "start": 3639.15, + "duration": 0.0, + "text": "of sit and think about this. Should I" + }, + { + "start": 3639.16, + "duration": 0.0, + "text": "of sit and think about this. Should I use<01:00:39.400> dropout<01:00:40.360> or<01:00:40.480> weight<01:00:40.760> decay<01:00:41.160> in<01:00:41.360> language" + }, + { + "start": 3641.67, + "duration": 0.0, + "text": "use dropout or weight decay in language" + }, + { + "start": 3641.68, + "duration": 0.0, + "text": "use dropout or weight decay in language model<01:00:41.920> training,<01:00:42.560> right?" + }, + { + "start": 3643.75, + "duration": 0.0, + "text": "model training, right?" + }, + { + "start": 3643.76, + "duration": 0.0, + "text": "model training, right? Okay,<01:00:44.080> you<01:00:44.160> can<01:00:44.280> think<01:00:44.400> about<01:00:44.640> it<01:00:44.720> a<01:00:44.760> bit." + }, + { + "start": 3646.43, + "duration": 0.0, + "text": "Okay, you can think about it a bit." + }, + { + "start": 3646.44, + "duration": 0.0, + "text": "Okay, you can think about it a bit. 
Um" + }, + { + "start": 3647.83, + "duration": 0.0, + "text": "Um" + }, + { + "start": 3647.84, + "duration": 0.0, + "text": "Um you<01:00:47.920> know,<01:00:48.080> one<01:00:48.320> unfortunate<01:00:48.840> thing<01:00:49.160> is<01:00:49.320> that" + }, + { + "start": 3649.59, + "duration": 0.0, + "text": "you know, one unfortunate thing is that" + }, + { + "start": 3649.6, + "duration": 0.0, + "text": "you know, one unfortunate thing is that a<01:00:49.640> lot<01:00:49.880> of<01:00:49.960> recent<01:00:50.280> models<01:00:50.600> don't<01:00:50.840> talk<01:00:51.040> about" + }, + { + "start": 3651.27, + "duration": 0.0, + "text": "a lot of recent models don't talk about" + }, + { + "start": 3651.28, + "duration": 0.0, + "text": "a lot of recent models don't talk about this<01:00:51.400> stuff<01:00:51.600> at<01:00:51.720> all.<01:00:52.160> Um<01:00:52.440> it's<01:00:52.640> really" + }, + { + "start": 3653.67, + "duration": 0.0, + "text": "this stuff at all. Um it's really" + }, + { + "start": 3653.68, + "duration": 0.0, + "text": "this stuff at all. Um it's really lower-level<01:00:54.400> details<01:00:55.000> than<01:00:55.240> like<01:00:55.680> tech" + }, + { + "start": 3655.87, + "duration": 0.0, + "text": "lower-level details than like tech" + }, + { + "start": 3655.88, + "duration": 0.0, + "text": "lower-level details than like tech reports<01:00:56.240> are<01:00:56.320> willing<01:00:56.560> to<01:00:56.640> expose." + }, + { + "start": 3657.83, + "duration": 0.0, + "text": "reports are willing to expose." + }, + { + "start": 3657.84, + "duration": 0.0, + "text": "reports are willing to expose. 
Um<01:00:58.400> but<01:00:58.560> if<01:00:58.680> you<01:00:58.840> look,<01:00:59.600> actually<01:01:00.160> you<01:01:00.280> find<01:01:00.520> a" + }, + { + "start": 3660.55, + "duration": 0.0, + "text": "Um but if you look, actually you find a" + }, + { + "start": 3660.56, + "duration": 0.0, + "text": "Um but if you look, actually you find a lot<01:01:00.840> of<01:01:00.920> models<01:01:01.880> um<01:01:02.160> do<01:01:02.320> both.<01:01:03.080> Especially" + }, + { + "start": 3663.67, + "duration": 0.0, + "text": "lot of models um do both. Especially" + }, + { + "start": 3663.68, + "duration": 0.0, + "text": "lot of models um do both. Especially weight<01:01:03.920> decay<01:01:04.240> actually<01:01:04.640> is<01:01:04.760> a<01:01:04.800> fairly" + }, + { + "start": 3665.43, + "duration": 0.0, + "text": "weight decay actually is a fairly" + }, + { + "start": 3665.44, + "duration": 0.0, + "text": "weight decay actually is a fairly popular<01:01:06.120> intervention<01:01:07.200> even<01:01:07.520> for<01:01:07.720> modern" + }, + { + "start": 3668.79, + "duration": 0.0, + "text": "popular intervention even for modern" + }, + { + "start": 3668.8, + "duration": 0.0, + "text": "popular intervention even for modern high-performance<01:01:09.920> language<01:01:10.360> models.<01:01:11.360> Um" + }, + { + "start": 3671.91, + "duration": 0.0, + "text": "high-performance language models. Um" + }, + { + "start": 3671.92, + "duration": 0.0, + "text": "high-performance language models. Um this<01:01:12.120> is<01:01:12.320> very,<01:01:12.600> very<01:01:12.880> surprising,<01:01:13.800> you<01:01:13.880> know?" + }, + { + "start": 3674.15, + "duration": 0.0, + "text": "this is very, very surprising, you know?" + }, + { + "start": 3674.16, + "duration": 0.0, + "text": "this is very, very surprising, you know? 
I<01:01:14.240> mean,<01:01:14.720> some<01:01:14.880> of<01:01:14.960> the<01:01:15.040> dropout<01:01:15.640> things<01:01:16.640> um" + }, + { + "start": 3677.23, + "duration": 0.0, + "text": "I mean, some of the dropout things um" + }, + { + "start": 3677.24, + "duration": 0.0, + "text": "I mean, some of the dropout things um you<01:01:17.320> know,<01:01:17.440> maybe" + }, + { + "start": 3679.07, + "duration": 0.0, + "text": "you know, maybe" + }, + { + "start": 3679.08, + "duration": 0.0, + "text": "you know, maybe uh<01:01:19.680> have<01:01:19.880> gone<01:01:20.160> out<01:01:20.280> of<01:01:20.360> favor,<01:01:21.040> but<01:01:21.200> weight" + }, + { + "start": 3681.39, + "duration": 0.0, + "text": "uh have gone out of favor, but weight" + }, + { + "start": 3681.4, + "duration": 0.0, + "text": "uh have gone out of favor, but weight decay<01:01:21.600> actually<01:01:22.000> remains<01:01:22.520> fairly<01:01:22.880> popular." + }, + { + "start": 3683.35, + "duration": 0.0, + "text": "decay actually remains fairly popular." + }, + { + "start": 3683.36, + "duration": 0.0, + "text": "decay actually remains fairly popular. And<01:01:23.440> this<01:01:23.600> is<01:01:23.760> very<01:01:24.000> mystifying.<01:01:24.600> Like,<01:01:24.760> why" + }, + { + "start": 3684.99, + "duration": 0.0, + "text": "And this is very mystifying. Like, why" + }, + { + "start": 3685.0, + "duration": 0.0, + "text": "And this is very mystifying. Like, why is<01:01:25.200> this?" + }, + { + "start": 3686.23, + "duration": 0.0, + "text": "is this?" + }, + { + "start": 3686.24, + "duration": 0.0, + "text": "is this? 
Um<01:01:26.640> and<01:01:26.760> this<01:01:26.920> is,<01:01:27.280> you<01:01:27.360> know,<01:01:27.480> one<01:01:27.640> of<01:01:27.680> the" + }, + { + "start": 3687.79, + "duration": 0.0, + "text": "Um and this is, you know, one of the" + }, + { + "start": 3687.8, + "duration": 0.0, + "text": "Um and this is, you know, one of the reasons<01:01:28.120> why<01:01:28.200> I<01:01:28.240> think<01:01:28.480> deep<01:01:28.680> learning<01:01:28.960> is" + }, + { + "start": 3689.11, + "duration": 0.0, + "text": "reasons why I think deep learning is" + }, + { + "start": 3689.12, + "duration": 0.0, + "text": "reasons why I think deep learning is hard<01:01:29.560> and<01:01:29.640> this<01:01:29.800> architecture<01:01:30.560> lecture<01:01:30.960> is" + }, + { + "start": 3691.07, + "duration": 0.0, + "text": "hard and this architecture lecture is" + }, + { + "start": 3691.08, + "duration": 0.0, + "text": "hard and this architecture lecture is very<01:01:31.280> strange<01:01:31.640> and<01:01:31.760> hard.<01:01:32.680> Um<01:01:33.160> it's<01:01:33.360> because" + }, + { + "start": 3693.67, + "duration": 0.0, + "text": "very strange and hard. Um it's because" + }, + { + "start": 3693.68, + "duration": 0.0, + "text": "very strange and hard. Um it's because these<01:01:33.800> things<01:01:34.040> interact<01:01:34.560> in<01:01:34.720> very<01:01:35.000> strange" + }, + { + "start": 3695.47, + "duration": 0.0, + "text": "these things interact in very strange" + }, + { + "start": 3695.48, + "duration": 0.0, + "text": "these things interact in very strange ways.<01:01:36.040> So,<01:01:36.680> there<01:01:36.800> have<01:01:36.960> been<01:01:37.120> papers<01:01:37.880> that" + }, + { + "start": 3697.99, + "duration": 0.0, + "text": "ways. So, there have been papers that" + }, + { + "start": 3698.0, + "duration": 0.0, + "text": "ways. 
So, there have been papers that have<01:01:38.160> argued<01:01:39.000> um<01:01:39.360> and<01:01:39.560> shown<01:01:39.840> nice<01:01:40.040> evidence" + }, + { + "start": 3701.07, + "duration": 0.0, + "text": "have argued um and shown nice evidence" + }, + { + "start": 3701.08, + "duration": 0.0, + "text": "have argued um and shown nice evidence that<01:01:41.840> weight<01:01:42.160> decay<01:01:42.680> is<01:01:42.880> actually<01:01:43.240> not<01:01:43.680> a" + }, + { + "start": 3703.79, + "duration": 0.0, + "text": "that weight decay is actually not a" + }, + { + "start": 3703.8, + "duration": 0.0, + "text": "that weight decay is actually not a regularizer<01:01:44.720> sometimes.<01:01:45.360> It<01:01:45.480> actually" + }, + { + "start": 3706.47, + "duration": 0.0, + "text": "regularizer sometimes. It actually" + }, + { + "start": 3706.48, + "duration": 0.0, + "text": "regularizer sometimes. It actually interacts<01:01:47.400> with<01:01:47.680> the<01:01:47.840> optimizer<01:01:49.080> to" + }, + { + "start": 3709.95, + "duration": 0.0, + "text": "interacts with the optimizer to" + }, + { + "start": 3709.96, + "duration": 0.0, + "text": "interacts with the optimizer to essentially<01:01:51.200> um" + }, + { + "start": 3711.79, + "duration": 0.0, + "text": "essentially um" + }, + { + "start": 3711.8, + "duration": 0.0, + "text": "essentially um make<01:01:52.280> optimization<01:01:53.040> better.<01:01:53.680> Um<01:01:54.480> so,<01:01:54.680> if<01:01:54.840> you" + }, + { + "start": 3714.99, + "duration": 0.0, + "text": "make optimization better. Um so, if you" + }, + { + "start": 3715.0, + "duration": 0.0, + "text": "make optimization better. 
Um so, if you look<01:01:55.440> at<01:01:55.720> the<01:01:55.840> training<01:01:56.520> versus<01:01:56.880> validation" + }, + { + "start": 3717.55, + "duration": 0.0, + "text": "look at the training versus validation" + }, + { + "start": 3717.56, + "duration": 0.0, + "text": "look at the training versus validation loss<01:01:57.840> across<01:01:58.200> different<01:01:58.560> weight<01:01:58.800> decay" + }, + { + "start": 3719.15, + "duration": 0.0, + "text": "loss across different weight decay" + }, + { + "start": 3719.16, + "duration": 0.0, + "text": "loss across different weight decay settings<01:01:59.680> on,<01:02:00.200> you<01:02:00.320> know,<01:02:00.440> language<01:02:00.800> model" + }, + { + "start": 3721.11, + "duration": 0.0, + "text": "settings on, you know, language model" + }, + { + "start": 3721.12, + "duration": 0.0, + "text": "settings on, you know, language model training<01:02:01.600> for<01:02:01.760> single<01:02:02.040> pass<01:02:02.280> SGD,<01:02:02.960> you<01:02:03.040> don't" + }, + { + "start": 3723.19, + "duration": 0.0, + "text": "training for single pass SGD, you don't" + }, + { + "start": 3723.2, + "duration": 0.0, + "text": "training for single pass SGD, you don't really<01:02:03.400> see<01:02:03.560> any<01:02:03.680> difference.<01:02:04.080> Like,<01:02:04.240> weight" + }, + { + "start": 3724.47, + "duration": 0.0, + "text": "really see any difference. Like, weight" + }, + { + "start": 3724.48, + "duration": 0.0, + "text": "really see any difference. 
Like, weight decay<01:02:04.760> isn't<01:02:05.120> shifting<01:02:05.560> things<01:02:05.880> so<01:02:05.960> the" + }, + { + "start": 3726.07, + "duration": 0.0, + "text": "decay isn't shifting things so the" + }, + { + "start": 3726.08, + "duration": 0.0, + "text": "decay isn't shifting things so the validation<01:02:06.640> loss<01:02:06.800> is<01:02:06.920> better.<01:02:07.160> There's" + }, + { + "start": 3727.35, + "duration": 0.0, + "text": "validation loss is better. There's" + }, + { + "start": 3727.36, + "duration": 0.0, + "text": "validation loss is better. There's already<01:02:07.800> no<01:02:07.960> overfitting.<01:02:08.440> We're<01:02:08.520> on<01:02:08.600> the<01:02:08.680> x" + }, + { + "start": 3728.87, + "duration": 0.0, + "text": "already no overfitting. We're on the x" + }, + { + "start": 3728.88, + "duration": 0.0, + "text": "already no overfitting. We're on the x equals<01:02:09.160> y<01:02:09.360> line<01:02:09.600> here,<01:02:09.760> right?<01:02:10.600> So,<01:02:10.880> doesn't" + }, + { + "start": 3731.27, + "duration": 0.0, + "text": "equals y line here, right? So, doesn't" + }, + { + "start": 3731.28, + "duration": 0.0, + "text": "equals y line here, right? 
So, doesn't control<01:02:11.600> overfitting," + }, + { + "start": 3732.95, + "duration": 0.0, + "text": "control overfitting," + }, + { + "start": 3732.96, + "duration": 0.0, + "text": "control overfitting, but<01:02:13.720> if<01:02:13.840> we<01:02:13.960> kind<01:02:14.280> of<01:02:14.440> look<01:02:14.960> at<01:02:15.160> different" + }, + { + "start": 3735.55, + "duration": 0.0, + "text": "but if we kind of look at different" + }, + { + "start": 3735.56, + "duration": 0.0, + "text": "but if we kind of look at different levels<01:02:15.920> of<01:02:16.120> weight<01:02:16.400> decay,<01:02:17.240> and<01:02:17.560> not<01:02:17.760> only" + }, + { + "start": 3737.99, + "duration": 0.0, + "text": "levels of weight decay, and not only" + }, + { + "start": 3738.0, + "duration": 0.0, + "text": "levels of weight decay, and not only just<01:02:18.200> different<01:02:18.480> levels<01:02:18.720> of<01:02:18.840> weight<01:02:19.080> decay," + }, + { + "start": 3739.43, + "duration": 0.0, + "text": "just different levels of weight decay," + }, + { + "start": 3739.44, + "duration": 0.0, + "text": "just different levels of weight decay, we<01:02:19.600> look<01:02:19.800> at<01:02:19.960> weight<01:02:20.200> decay<01:02:20.480> combined<01:02:21.760> with" + }, + { + "start": 3742.47, + "duration": 0.0, + "text": "we look at weight decay combined with" + }, + { + "start": 3742.48, + "duration": 0.0, + "text": "we look at weight decay combined with learning<01:02:22.960> rate<01:02:23.200> decay,<01:02:24.040> um<01:02:24.480> what<01:02:24.680> we<01:02:24.840> find<01:02:25.240> is" + }, + { + "start": 3745.35, + "duration": 0.0, + "text": "learning rate decay, um what we find is" + }, + { + "start": 3745.36, + "duration": 0.0, + "text": "learning rate decay, um what we find is that<01:02:25.520> the<01:02:25.640> stronger<01:02:26.400> weight<01:02:26.640> decay<01:02:27.080> runs," + }, + { + "start": 3747.51, + "duration": 0.0, + "text": "that the stronger 
weight decay runs," + }, + { + "start": 3747.52, + "duration": 0.0, + "text": "that the stronger weight decay runs, these<01:02:27.760> blue<01:02:28.320> dash<01:02:28.760> lines<01:02:29.000> on<01:02:29.120> the<01:02:29.200> bottom,<01:02:30.000> you" + }, + { + "start": 3750.07, + "duration": 0.0, + "text": "these blue dash lines on the bottom, you" + }, + { + "start": 3750.08, + "duration": 0.0, + "text": "these blue dash lines on the bottom, you know,<01:02:30.200> do<01:02:30.360> significantly<01:02:31.080> better<01:02:31.920> because" + }, + { + "start": 3752.19, + "duration": 0.0, + "text": "know, do significantly better because" + }, + { + "start": 3752.2, + "duration": 0.0, + "text": "know, do significantly better because they<01:02:32.560> start<01:02:32.920> out<01:02:33.160> slow,<01:02:33.840> but<01:02:34.040> they" + }, + { + "start": 3754.79, + "duration": 0.0, + "text": "they start out slow, but they" + }, + { + "start": 3754.8, + "duration": 0.0, + "text": "they start out slow, but they essentially<01:02:35.520> end<01:02:35.760> up<01:02:36.320> um<01:02:36.560> converging<01:02:37.320> to<01:02:37.440> a" + }, + { + "start": 3757.51, + "duration": 0.0, + "text": "essentially end up um converging to a" + }, + { + "start": 3757.52, + "duration": 0.0, + "text": "essentially end up um converging to a much<01:02:37.800> better<01:02:38.040> minimum<01:02:38.760> later.<01:02:39.120> And<01:02:39.280> this<01:02:39.480> is," + }, + { + "start": 3759.63, + "duration": 0.0, + "text": "much better minimum later. And this is," + }, + { + "start": 3759.64, + "duration": 0.0, + "text": "much better minimum later. 
And this is, you<01:02:39.720> know,<01:02:40.200> generally<01:02:40.680> true<01:02:40.880> when<01:02:41.000> we<01:02:41.080> decay" + }, + { + "start": 3761.47, + "duration": 0.0, + "text": "you know, generally true when we decay" + }, + { + "start": 3761.48, + "duration": 0.0, + "text": "you know, generally true when we decay learning<01:02:41.840> rate,<01:02:42.400> not<01:02:42.880> necessarily<01:02:43.360> true<01:02:44.000> when" + }, + { + "start": 3764.15, + "duration": 0.0, + "text": "learning rate, not necessarily true when" + }, + { + "start": 3764.16, + "duration": 0.0, + "text": "learning rate, not necessarily true when we're<01:02:44.280> in<01:02:44.400> constant<01:02:44.960> learning<01:02:45.240> rate,<01:02:45.400> which" + }, + { + "start": 3765.51, + "duration": 0.0, + "text": "we're in constant learning rate, which" + }, + { + "start": 3765.52, + "duration": 0.0, + "text": "we're in constant learning rate, which is<01:02:45.640> maybe<01:02:46.280> somewhat<01:02:46.760> more<01:02:47.040> of<01:02:47.160> where<01:02:47.280> your" + }, + { + "start": 3767.43, + "duration": 0.0, + "text": "is maybe somewhat more of where your" + }, + { + "start": 3767.44, + "duration": 0.0, + "text": "is maybe somewhat more of where your intuition<01:02:47.840> is<01:02:47.960> coming<01:02:48.280> from." + }, + { + "start": 3769.43, + "duration": 0.0, + "text": "intuition is coming from." + }, + { + "start": 3769.44, + "duration": 0.0, + "text": "intuition is coming from. 
So,<01:02:50.120> you<01:02:50.200> know,<01:02:50.320> this<01:02:50.480> is<01:02:50.640> part<01:02:50.960> of<01:02:51.080> why<01:02:51.280> it's" + }, + { + "start": 3771.39, + "duration": 0.0, + "text": "So, you know, this is part of why it's" + }, + { + "start": 3771.4, + "duration": 0.0, + "text": "So, you know, this is part of why it's very<01:02:51.640> difficult<01:02:52.200> to<01:02:52.400> reason<01:02:53.160> sort<01:02:53.320> of<01:02:53.480> a" + }, + { + "start": 3773.59, + "duration": 0.0, + "text": "very difficult to reason sort of a" + }, + { + "start": 3773.6, + "duration": 0.0, + "text": "very difficult to reason sort of a priori<01:02:54.240> or<01:02:54.360> like<01:02:54.560> from<01:02:54.720> scratch,<01:02:55.720> you<01:02:55.760> know," + }, + { + "start": 3775.83, + "duration": 0.0, + "text": "priori or like from scratch, you know," + }, + { + "start": 3775.84, + "duration": 0.0, + "text": "priori or like from scratch, you know, the<01:02:55.960> behavior<01:02:56.520> of<01:02:56.680> all<01:02:56.840> these<01:02:57.040> different" + }, + { + "start": 3777.35, + "duration": 0.0, + "text": "the behavior of all these different" + }, + { + "start": 3777.36, + "duration": 0.0, + "text": "the behavior of all these different choices<01:02:57.920> and<01:02:58.080> why,<01:02:58.760> you<01:02:58.880> know,<01:02:59.000> I<01:02:59.080> think<01:02:59.280> Percy" + }, + { + "start": 3779.63, + "duration": 0.0, + "text": "choices and why, you know, I think Percy" + }, + { + "start": 3779.64, + "duration": 0.0, + "text": "choices and why, you know, I think Percy and<01:02:59.800> I<01:02:59.880> have<01:03:00.000> designed<01:03:00.400> this<01:03:00.560> class<01:03:00.880> so<01:03:00.960> that" + }, + { + "start": 3781.15, + "duration": 0.0, + "text": "and I have designed this class so that" + }, + { + "start": 3781.16, + "duration": 0.0, + "text": "and I have designed this class so that you<01:03:01.280> interact<01:03:01.800> 
with<01:03:01.960> stuff<01:03:02.760> because,<01:03:03.240> you" + }, + { + "start": 3783.31, + "duration": 0.0, + "text": "you interact with stuff because, you" + }, + { + "start": 3783.32, + "duration": 0.0, + "text": "you interact with stuff because, you know,<01:03:03.440> you<01:03:03.560> might<01:03:04.200> come<01:03:04.400> upon<01:03:04.680> this<01:03:04.840> thing" + }, + { + "start": 3785.07, + "duration": 0.0, + "text": "know, you might come upon this thing" + }, + { + "start": 3785.08, + "duration": 0.0, + "text": "know, you might come upon this thing that<01:03:05.400> where<01:03:05.520> basically<01:03:06.240> weight<01:03:06.480> decay<01:03:06.680> is" + }, + { + "start": 3786.79, + "duration": 0.0, + "text": "that where basically weight decay is" + }, + { + "start": 3786.8, + "duration": 0.0, + "text": "that where basically weight decay is actually<01:03:07.040> an<01:03:07.120> optimization<01:03:07.800> intervention" + }, + { + "start": 3788.71, + "duration": 0.0, + "text": "actually an optimization intervention" + }, + { + "start": 3788.72, + "duration": 0.0, + "text": "actually an optimization intervention and<01:03:08.880> not<01:03:09.120> necessarily<01:03:10.160> a<01:03:10.800> regularization" + }, + { + "start": 3791.51, + "duration": 0.0, + "text": "and not necessarily a regularization" + }, + { + "start": 3791.52, + "duration": 0.0, + "text": "and not necessarily a regularization intervention,<01:03:11.960> which<01:03:12.080> is,<01:03:12.200> you<01:03:12.280> know,<01:03:12.400> what" + }, + { + "start": 3792.55, + "duration": 0.0, + "text": "intervention, which is, you know, what" + }, + { + "start": 3792.56, + "duration": 0.0, + "text": "intervention, which is, you know, what you<01:03:12.640> would<01:03:12.720> expect<01:03:13.200> here,<01:03:13.520> right?<01:03:13.720> So,<01:03:14.240> always" + }, + { + "start": 3794.51, + "duration": 0.0, + "text": "you would expect here, right? 
So, always" + }, + { + "start": 3794.52, + "duration": 0.0, + "text": "you would expect here, right? So, always keep<01:03:14.680> that<01:03:14.840> in<01:03:14.960> mind<01:03:15.200> that<01:03:15.520> these<01:03:15.800> kinds<01:03:16.240> of" + }, + { + "start": 3796.35, + "duration": 0.0, + "text": "keep that in mind that these kinds of" + }, + { + "start": 3796.36, + "duration": 0.0, + "text": "keep that in mind that these kinds of unexpected<01:03:16.960> effects<01:03:17.680> can<01:03:17.920> really<01:03:18.320> start<01:03:18.600> to" + }, + { + "start": 3798.67, + "duration": 0.0, + "text": "unexpected effects can really start to" + }, + { + "start": 3798.68, + "duration": 0.0, + "text": "unexpected effects can really start to kick<01:03:18.960> in" + }, + { + "start": 3799.95, + "duration": 0.0, + "text": "kick in" + }, + { + "start": 3799.96, + "duration": 0.0, + "text": "kick in uh<01:03:20.040> for<01:03:20.240> these<01:03:20.480> kinds<01:03:21.360> of<01:03:21.960> uh<01:03:22.080> settings." + }, + { + "start": 3804.23, + "duration": 0.0, + "text": "uh for these kinds of uh settings." + }, + { + "start": 3804.24, + "duration": 0.0, + "text": "uh for these kinds of uh settings. Cool." + }, + { + "start": 3806.19, + "duration": 0.0, + "text": "Cool." + }, + { + "start": 3806.2, + "duration": 0.0, + "text": "Cool. All<01:03:26.240> right.<01:03:26.760> So,<01:03:27.120> to<01:03:27.280> put<01:03:27.560> everything" + }, + { + "start": 3807.87, + "duration": 0.0, + "text": "All right. So, to put everything" + }, + { + "start": 3807.88, + "duration": 0.0, + "text": "All right. 
So, to put everything together<01:03:28.160> for<01:03:28.480> hyperparameters,<01:03:29.760> there's" + }, + { + "start": 3809.99, + "duration": 0.0, + "text": "together for hyperparameters, there's" + }, + { + "start": 3810.0, + "duration": 0.0, + "text": "together for hyperparameters, there's actually<01:03:30.520> for,<01:03:30.760> you<01:03:30.840> know,<01:03:31.000> a<01:03:31.040> lot<01:03:31.280> of<01:03:31.360> the<01:03:31.840> the" + }, + { + "start": 3811.91, + "duration": 0.0, + "text": "actually for, you know, a lot of the the" + }, + { + "start": 3811.92, + "duration": 0.0, + "text": "actually for, you know, a lot of the the maybe<01:03:32.160> more<01:03:32.360> hairy-looking" + }, + { + "start": 3813.11, + "duration": 0.0, + "text": "maybe more hairy-looking" + }, + { + "start": 3813.12, + "duration": 0.0, + "text": "maybe more hairy-looking hyperparameters,<01:03:34.320> actually<01:03:34.640> just<01:03:34.880> fairly" + }, + { + "start": 3815.23, + "duration": 0.0, + "text": "hyperparameters, actually just fairly" + }, + { + "start": 3815.24, + "duration": 0.0, + "text": "hyperparameters, actually just fairly standard<01:03:35.760> choices<01:03:36.160> that<01:03:36.280> have<01:03:36.480> worked<01:03:36.760> well" + }, + { + "start": 3816.99, + "duration": 0.0, + "text": "standard choices that have worked well" + }, + { + "start": 3817.0, + "duration": 0.0, + "text": "standard choices that have worked well for<01:03:37.200> everybody,<01:03:37.800> right?<01:03:38.400> You<01:03:38.480> know,<01:03:38.880> factor" + }, + { + "start": 3819.23, + "duration": 0.0, + "text": "for everybody, right? You know, factor" + }, + { + "start": 3819.24, + "duration": 0.0, + "text": "for everybody, right? 
You know, factor of<01:03:39.360> four<01:03:39.560> rule<01:03:39.760> of<01:03:39.880> thumb,<01:03:40.440> keep<01:03:40.680> your<01:03:40.840> head" + }, + { + "start": 3821.07, + "duration": 0.0, + "text": "of four rule of thumb, keep your head" + }, + { + "start": 3821.08, + "duration": 0.0, + "text": "of four rule of thumb, keep your head dim<01:03:41.280> and<01:03:41.360> your<01:03:41.480> number<01:03:41.840> of<01:03:41.960> heads<01:03:42.720> uh<01:03:42.840> equal<01:03:43.240> to" + }, + { + "start": 3823.35, + "duration": 0.0, + "text": "dim and your number of heads uh equal to" + }, + { + "start": 3823.36, + "duration": 0.0, + "text": "dim and your number of heads uh equal to the<01:03:43.440> model<01:03:43.720> dimension,<01:03:44.920> um<01:03:45.360> pick<01:03:45.520> an<01:03:45.640> aspect" + }, + { + "start": 3826.03, + "duration": 0.0, + "text": "the model dimension, um pick an aspect" + }, + { + "start": 3826.04, + "duration": 0.0, + "text": "the model dimension, um pick an aspect ratio<01:03:46.440> roughly<01:03:46.720> around<01:03:47.000> 100,<01:03:48.040> um" + }, + { + "start": 3828.75, + "duration": 0.0, + "text": "ratio roughly around 100, um" + }, + { + "start": 3828.76, + "duration": 0.0, + "text": "ratio roughly around 100, um and,<01:03:49.400> you<01:03:49.480> know,<01:03:49.560> if<01:03:49.720> you<01:03:49.840> ask<01:03:50.040> about" + }, + { + "start": 3830.19, + "duration": 0.0, + "text": "and, you know, if you ask about" + }, + { + "start": 3830.2, + "duration": 0.0, + "text": "and, you know, if you ask about regularization,<01:03:51.080> right?<01:03:51.680> You<01:03:51.840> want<01:03:52.000> to<01:03:52.080> maybe" + }, + { + "start": 3832.27, + "duration": 0.0, + "text": "regularization, right? You want to maybe" + }, + { + "start": 3832.28, + "duration": 0.0, + "text": "regularization, right? 
You want to maybe try<01:03:52.520> a<01:03:52.560> couple<01:03:52.840> things<01:03:53.080> cuz<01:03:53.240> regularization" + }, + { + "start": 3833.87, + "duration": 0.0, + "text": "try a couple things cuz regularization" + }, + { + "start": 3833.88, + "duration": 0.0, + "text": "try a couple things cuz regularization actually<01:03:54.240> does<01:03:55.120> interact<01:03:55.680> with<01:03:55.880> optimizers" + }, + { + "start": 3836.55, + "duration": 0.0, + "text": "actually does interact with optimizers" + }, + { + "start": 3836.56, + "duration": 0.0, + "text": "actually does interact with optimizers in<01:03:56.680> ways<01:03:56.920> that<01:03:57.040> are<01:03:57.120> quite<01:03:57.440> counterintuitive," + }, + { + "start": 3838.31, + "duration": 0.0, + "text": "in ways that are quite counterintuitive," + }, + { + "start": 3838.32, + "duration": 0.0, + "text": "in ways that are quite counterintuitive, right?<01:03:58.520> So,<01:03:58.600> this<01:03:58.760> is<01:03:58.840> the<01:03:58.920> thing<01:03:59.120> that<01:03:59.280> some" + }, + { + "start": 3839.43, + "duration": 0.0, + "text": "right? So, this is the thing that some" + }, + { + "start": 3839.44, + "duration": 0.0, + "text": "right? So, this is the thing that some people<01:04:00.120> uh<01:04:00.320> still<01:04:00.680> do<01:04:00.880> even<01:04:01.120> though<01:04:01.240> you<01:04:01.480> you" + }, + { + "start": 3841.59, + "duration": 0.0, + "text": "people uh still do even though you you" + }, + { + "start": 3841.6, + "duration": 0.0, + "text": "people uh still do even though you you don't<01:04:01.800> need<01:04:02.320> the<01:04:02.400> regularization<01:04:03.040> at<01:04:03.200> all." + }, + { + "start": 3844.27, + "duration": 0.0, + "text": "don't need the regularization at all." + }, + { + "start": 3844.28, + "duration": 0.0, + "text": "don't need the regularization at all. 
Actually,<01:04:04.600> maybe<01:04:04.760> I'll<01:04:04.840> stop<01:04:05.080> here<01:04:05.520> in<01:04:05.600> case" + }, + { + "start": 3845.75, + "duration": 0.0, + "text": "Actually, maybe I'll stop here in case" + }, + { + "start": 3845.76, + "duration": 0.0, + "text": "Actually, maybe I'll stop here in case yeah." + }, + { + "start": 3847.07, + "duration": 0.0, + "text": "yeah." + }, + { + "start": 3847.08, + "duration": 0.0, + "text": "yeah. Are<01:04:07.200> there<01:04:07.280> any<01:04:07.400> significant<01:04:07.920> differences" + }, + { + "start": 3849.19, + "duration": 0.0, + "text": "Are there any significant differences" + }, + { + "start": 3849.2, + "duration": 0.0, + "text": "Are there any significant differences maybe<01:04:09.480> for<01:04:09.640> like<01:04:10.040> um" + }, + { + "start": 3850.79, + "duration": 0.0, + "text": "maybe for like um" + }, + { + "start": 3850.8, + "duration": 0.0, + "text": "maybe for like um the<01:04:10.880> future<01:04:11.280> models?" + }, + { + "start": 3852.349, + "duration": 0.0, + "text": "the future models?" + }, + { + "start": 3852.359, + "duration": 0.0, + "text": "the future models? Ooh,<01:04:12.680> diffusions." + }, + { + "start": 3854.23, + "duration": 0.0, + "text": "Ooh, diffusions." + }, + { + "start": 3854.24, + "duration": 0.0, + "text": "Ooh, diffusions. That<01:04:14.720> I<01:04:14.800> have<01:04:15.000> not<01:04:15.320> looked<01:04:15.520> into<01:04:15.720> enough,<01:04:16.000> to" + }, + { + "start": 3856.07, + "duration": 0.0, + "text": "That I have not looked into enough, to" + }, + { + "start": 3856.08, + "duration": 0.0, + "text": "That I have not looked into enough, to be<01:04:16.200> honest.<01:04:16.760> Um<01:04:17.240> there<01:04:17.359> aren't<01:04:17.520> that<01:04:17.680> many" + }, + { + "start": 3857.87, + "duration": 0.0, + "text": "be honest. Um there aren't that many" + }, + { + "start": 3857.88, + "duration": 0.0, + "text": "be honest. 
Um there aren't that many people<01:04:18.120> training<01:04:18.480> big<01:04:18.640> diffusions<01:04:19.080> is<01:04:19.200> one" + }, + { + "start": 3859.349, + "duration": 0.0, + "text": "people training big diffusions is one" + }, + { + "start": 3859.359, + "duration": 0.0, + "text": "people training big diffusions is one issue.<01:04:20.160> Um<01:04:20.520> and<01:04:20.640> many<01:04:20.840> of<01:04:20.920> the<01:04:21.000> models<01:04:21.280> that" + }, + { + "start": 3861.39, + "duration": 0.0, + "text": "issue. Um and many of the models that" + }, + { + "start": 3861.4, + "duration": 0.0, + "text": "issue. Um and many of the models that have<01:04:21.520> been<01:04:21.640> trained<01:04:21.880> are<01:04:21.960> retrofitted<01:04:23.000> cuz<01:04:23.120> I" + }, + { + "start": 3863.15, + "duration": 0.0, + "text": "have been trained are retrofitted cuz I" + }, + { + "start": 3863.16, + "duration": 0.0, + "text": "have been trained are retrofitted cuz I think<01:04:23.359> the<01:04:23.480> architectures<01:04:23.960> are<01:04:24.040> actually<01:04:24.320> the" + }, + { + "start": 3864.39, + "duration": 0.0, + "text": "think the architectures are actually the" + }, + { + "start": 3864.4, + "duration": 0.0, + "text": "think the architectures are actually the same<01:04:24.720> as<01:04:24.880> the,<01:04:25.400> you<01:04:25.480> know,<01:04:25.560> like<01:04:25.720> a<01:04:25.760> Lama-like" + }, + { + "start": 3866.31, + "duration": 0.0, + "text": "same as the, you know, like a Lama-like" + }, + { + "start": 3866.32, + "duration": 0.0, + "text": "same as the, you know, like a Lama-like model.<01:04:27.080> Um<01:04:27.240> but<01:04:27.359> if<01:04:27.440> you're<01:04:27.560> asking<01:04:27.760> the" + }, + { + "start": 3867.83, + "duration": 0.0, + "text": "model. Um but if you're asking the" + }, + { + "start": 3867.84, + "duration": 0.0, + "text": "model. 
Um but if you're asking the question<01:04:28.120> of<01:04:28.160> like,<01:04:28.320> what's<01:04:28.560> the<01:04:28.920> optimal" + }, + { + "start": 3869.349, + "duration": 0.0, + "text": "question of like, what's the optimal" + }, + { + "start": 3869.359, + "duration": 0.0, + "text": "question of like, what's the optimal architecture<01:04:29.920> if<01:04:30.000> you<01:04:30.120> were<01:04:30.280> to<01:04:30.400> train<01:04:30.640> from" + }, + { + "start": 3870.79, + "duration": 0.0, + "text": "architecture if you were to train from" + }, + { + "start": 3870.8, + "duration": 0.0, + "text": "architecture if you were to train from scratch,<01:04:31.240> I<01:04:31.320> don't<01:04:31.600> know<01:04:31.720> what<01:04:31.880> that<01:04:32.040> is" + }, + { + "start": 3872.19, + "duration": 0.0, + "text": "scratch, I don't know what that is" + }, + { + "start": 3872.2, + "duration": 0.0, + "text": "scratch, I don't know what that is actually<01:04:32.600> off<01:04:32.720> the<01:04:32.800> top<01:04:32.920> of<01:04:33.000> my<01:04:33.080> head." + }, + { + "start": 3874.03, + "duration": 0.0, + "text": "actually off the top of my head." + }, + { + "start": 3874.04, + "duration": 0.0, + "text": "actually off the top of my head. Yeah.<01:04:34.520> Do<01:04:34.680> you<01:04:34.720> have<01:04:34.840> any<01:04:34.960> explanation<01:04:35.600> for" + }, + { + "start": 3875.79, + "duration": 0.0, + "text": "Yeah. Do you have any explanation for" + }, + { + "start": 3875.8, + "duration": 0.0, + "text": "Yeah. Do you have any explanation for why<01:04:36.680> regularization<01:04:37.480> works<01:04:37.680> in<01:04:37.760> some<01:04:37.920> cases?" + }, + { + "start": 3879.19, + "duration": 0.0, + "text": "why regularization works in some cases?" + }, + { + "start": 3879.2, + "duration": 0.0, + "text": "why regularization works in some cases? 
Well,<01:04:39.359> I<01:04:39.440> guess<01:04:39.600> it's<01:04:39.760> not<01:04:39.920> that" + }, + { + "start": 3880.43, + "duration": 0.0, + "text": "Well, I guess it's not that" + }, + { + "start": 3880.44, + "duration": 0.0, + "text": "Well, I guess it's not that regularization<01:04:41.120> in<01:04:41.240> general<01:04:41.520> affects" + }, + { + "start": 3881.79, + "duration": 0.0, + "text": "regularization in general affects" + }, + { + "start": 3881.8, + "duration": 0.0, + "text": "regularization in general affects optimization.<01:04:42.280> I<01:04:42.320> don't<01:04:42.480> think<01:04:42.600> people<01:04:42.840> do" + }, + { + "start": 3882.91, + "duration": 0.0, + "text": "optimization. I don't think people do" + }, + { + "start": 3882.92, + "duration": 0.0, + "text": "optimization. I don't think people do dropout<01:04:43.359> anymore<01:04:43.720> because,<01:04:44.440> you<01:04:44.520> know,<01:04:44.600> it" + }, + { + "start": 3884.67, + "duration": 0.0, + "text": "dropout anymore because, you know, it" + }, + { + "start": 3884.68, + "duration": 0.0, + "text": "dropout anymore because, you know, it doesn't<01:04:44.960> really<01:04:45.680> uh<01:04:45.760> interact<01:04:46.120> well<01:04:46.240> with" + }, + { + "start": 3886.349, + "duration": 0.0, + "text": "doesn't really uh interact well with" + }, + { + "start": 3886.359, + "duration": 0.0, + "text": "doesn't really uh interact well with optimization.<01:04:46.840> But<01:04:47.000> for<01:04:47.080> example,<01:04:47.400> weight" + }, + { + "start": 3887.59, + "duration": 0.0, + "text": "optimization. But for example, weight" + }, + { + "start": 3887.6, + "duration": 0.0, + "text": "optimization. 
But for example, weight decay,<01:04:48.400> you<01:04:48.480> know,<01:04:48.560> which<01:04:48.680> is<01:04:48.760> shrinkage<01:04:49.160> to" + }, + { + "start": 3889.27, + "duration": 0.0, + "text": "decay, you know, which is shrinkage to" + }, + { + "start": 3889.28, + "duration": 0.0, + "text": "decay, you know, which is shrinkage to zero,<01:04:50.280> um<01:04:50.440> that<01:04:50.560> might<01:04:50.720> allow<01:04:50.960> you<01:04:51.040> to<01:04:51.200> use<01:04:51.440> a" + }, + { + "start": 3891.59, + "duration": 0.0, + "text": "zero, um that might allow you to use a" + }, + { + "start": 3891.6, + "duration": 0.0, + "text": "zero, um that might allow you to use a higher<01:04:51.920> learning<01:04:52.280> rate<01:04:52.560> or<01:04:52.680> it<01:04:52.760> might<01:04:52.960> allow" + }, + { + "start": 3893.15, + "duration": 0.0, + "text": "higher learning rate or it might allow" + }, + { + "start": 3893.16, + "duration": 0.0, + "text": "higher learning rate or it might allow you<01:04:53.240> to<01:04:53.359> decay<01:04:53.680> faster.<01:04:54.560> There<01:04:54.680> are<01:04:54.720> lots<01:04:55.000> of" + }, + { + "start": 3895.11, + "duration": 0.0, + "text": "you to decay faster. There are lots of" + }, + { + "start": 3895.12, + "duration": 0.0, + "text": "you to decay faster. There are lots of ways<01:04:55.359> in<01:04:55.400> which<01:04:55.600> all<01:04:55.800> these<01:04:56.000> terms<01:04:56.200> are" + }, + { + "start": 3896.31, + "duration": 0.0, + "text": "ways in which all these terms are" + }, + { + "start": 3896.32, + "duration": 0.0, + "text": "ways in which all these terms are interrelated." + }, + { + "start": 3899.359, + "duration": 0.0, + "text": "Cool.<01:04:59.680> Okay." + }, + { + "start": 3901.91, + "duration": 0.0, + "text": "Cool. Okay." + }, + { + "start": 3901.92, + "duration": 0.0, + "text": "Cool. Okay. 
Now,<01:05:02.480> I've<01:05:02.640> talked<01:05:02.920> a<01:05:02.960> lot<01:05:03.160> about<01:05:03.440> how<01:05:03.560> to" + }, + { + "start": 3903.67, + "duration": 0.0, + "text": "Now, I've talked a lot about how to" + }, + { + "start": 3903.68, + "duration": 0.0, + "text": "Now, I've talked a lot about how to design<01:05:04.800> um<01:05:05.000> expressive<01:05:05.680> models<01:05:06.560> by<01:05:07.200> sort<01:05:07.320> of" + }, + { + "start": 3907.43, + "duration": 0.0, + "text": "design um expressive models by sort of" + }, + { + "start": 3907.44, + "duration": 0.0, + "text": "design um expressive models by sort of looking<01:05:07.680> at<01:05:07.800> all<01:05:07.920> these<01:05:08.120> other<01:05:08.280> models<01:05:08.600> that" + }, + { + "start": 3908.67, + "duration": 0.0, + "text": "looking at all these other models that" + }, + { + "start": 3908.68, + "duration": 0.0, + "text": "looking at all these other models that have<01:05:08.760> been<01:05:08.880> trained." + }, + { + "start": 3909.91, + "duration": 0.0, + "text": "have been trained." + }, + { + "start": 3909.92, + "duration": 0.0, + "text": "have been trained. 
Um<01:05:10.240> one<01:05:10.400> of<01:05:10.480> the<01:05:10.560> things<01:05:10.840> that<01:05:11.000> I'll<01:05:11.200> I'll" + }, + { + "start": 3911.39, + "duration": 0.0, + "text": "Um one of the things that I'll I'll" + }, + { + "start": 3911.4, + "duration": 0.0, + "text": "Um one of the things that I'll I'll highlight<01:05:11.880> now<01:05:12.320> is<01:05:12.520> over<01:05:12.720> the<01:05:12.840> last<01:05:13.160> few<01:05:13.400> years" + }, + { + "start": 3914.07, + "duration": 0.0, + "text": "highlight now is over the last few years" + }, + { + "start": 3914.08, + "duration": 0.0, + "text": "highlight now is over the last few years um<01:05:14.440> a<01:05:14.520> really<01:05:14.920> big<01:05:15.800> emphasis<01:05:16.280> has<01:05:16.440> not<01:05:16.680> been<01:05:16.800> on" + }, + { + "start": 3916.95, + "duration": 0.0, + "text": "um a really big emphasis has not been on" + }, + { + "start": 3916.96, + "duration": 0.0, + "text": "um a really big emphasis has not been on performance<01:05:17.600> alone.<01:05:18.160> It<01:05:18.280> has<01:05:18.440> actually<01:05:18.760> been" + }, + { + "start": 3918.91, + "duration": 0.0, + "text": "performance alone. It has actually been" + }, + { + "start": 3918.92, + "duration": 0.0, + "text": "performance alone. It has actually been on<01:05:19.040> stability.<01:05:20.320> And<01:05:20.440> this<01:05:20.600> becomes<01:05:20.920> an" + }, + { + "start": 3921.03, + "duration": 0.0, + "text": "on stability. And this becomes an" + }, + { + "start": 3921.04, + "duration": 0.0, + "text": "on stability. 
And this becomes an increasingly<01:05:21.640> important<01:05:22.040> concern<01:05:22.920> as<01:05:23.080> your" + }, + { + "start": 3923.19, + "duration": 0.0, + "text": "increasingly important concern as your" + }, + { + "start": 3923.2, + "duration": 0.0, + "text": "increasingly important concern as your models<01:05:23.760> get<01:05:24.000> more<01:05:24.200> and<01:05:24.320> more<01:05:24.440> expensive<01:05:25.120> to" + }, + { + "start": 3925.27, + "duration": 0.0, + "text": "models get more and more expensive to" + }, + { + "start": 3925.28, + "duration": 0.0, + "text": "models get more and more expensive to train,<01:05:25.920> right?<01:05:26.560> Um<01:05:26.800> we've<01:05:26.960> kind<01:05:27.160> of<01:05:27.240> seen<01:05:27.440> that" + }, + { + "start": 3927.55, + "duration": 0.0, + "text": "train, right? Um we've kind of seen that" + }, + { + "start": 3927.56, + "duration": 0.0, + "text": "train, right? Um we've kind of seen that a<01:05:27.640> lot<01:05:27.840> of<01:05:27.920> these<01:05:28.080> choices<01:05:28.480> are<01:05:28.600> forgiving," + }, + { + "start": 3929.15, + "duration": 0.0, + "text": "a lot of these choices are forgiving," + }, + { + "start": 3929.16, + "duration": 0.0, + "text": "a lot of these choices are forgiving, right?<01:05:29.320> Everyone's<01:05:29.640> kind<01:05:29.760> of<01:05:29.840> doing<01:05:30.120> similar" + }, + { + "start": 3930.51, + "duration": 0.0, + "text": "right? Everyone's kind of doing similar" + }, + { + "start": 3930.52, + "duration": 0.0, + "text": "right? Everyone's kind of doing similar stuff.<01:05:31.520> And<01:05:31.640> so,<01:05:32.240> you<01:05:32.359> know,<01:05:32.480> you<01:05:32.560> can<01:05:32.720> mess" + }, + { + "start": 3932.95, + "duration": 0.0, + "text": "stuff. And so, you know, you can mess" + }, + { + "start": 3932.96, + "duration": 0.0, + "text": "stuff. 
And so, you know, you can mess with<01:05:33.120> these,<01:05:33.320> but<01:05:33.440> you're<01:05:33.560> not<01:05:33.680> going<01:05:33.800> to<01:05:33.880> get" + }, + { + "start": 3933.95, + "duration": 0.0, + "text": "with these, but you're not going to get" + }, + { + "start": 3933.96, + "duration": 0.0, + "text": "with these, but you're not going to get a<01:05:34.000> big<01:05:34.160> performance<01:05:34.640> difference.<01:05:35.280> That's" + }, + { + "start": 3935.51, + "duration": 0.0, + "text": "a big performance difference. That's" + }, + { + "start": 3935.52, + "duration": 0.0, + "text": "a big performance difference. That's fine.<01:05:36.400> But<01:05:36.800> if<01:05:36.920> your<01:05:37.040> model,<01:05:37.480> you<01:05:37.520> know," + }, + { + "start": 3937.67, + "duration": 0.0, + "text": "fine. But if your model, you know," + }, + { + "start": 3937.68, + "duration": 0.0, + "text": "fine. But if your model, you know, suddenly<01:05:38.240> blows<01:05:38.640> up<01:05:38.840> some<01:05:39.160> part<01:05:39.440> into" + }, + { + "start": 3939.63, + "duration": 0.0, + "text": "suddenly blows up some part into" + }, + { + "start": 3939.64, + "duration": 0.0, + "text": "suddenly blows up some part into training,<01:05:40.040> like<01:05:40.160> you<01:05:40.240> get<01:05:40.359> these<01:05:40.520> like" + }, + { + "start": 3940.67, + "duration": 0.0, + "text": "training, like you get these like" + }, + { + "start": 3940.68, + "duration": 0.0, + "text": "training, like you get these like horrible-looking<01:05:41.400> spikes<01:05:42.040> all<01:05:42.280> over<01:05:42.440> the" + }, + { + "start": 3942.55, + "duration": 0.0, + "text": "horrible-looking spikes all over the" + }, + { + "start": 3942.56, + "duration": 0.0, + "text": "horrible-looking spikes all over the place,<01:05:43.480> um<01:05:44.080> you<01:05:44.160> know,<01:05:44.240> you<01:05:44.320> might<01:05:44.520> end<01:05:44.680> up" + }, + { + "start": 3944.79, + 
"duration": 0.0, + "text": "place, um you know, you might end up" + }, + { + "start": 3944.8, + "duration": 0.0, + "text": "place, um you know, you might end up with<01:05:44.920> a<01:05:44.960> model<01:05:45.320> that<01:05:45.480> is,<01:05:45.880> you<01:05:45.960> know,<01:05:46.080> actually" + }, + { + "start": 3946.39, + "duration": 0.0, + "text": "with a model that is, you know, actually" + }, + { + "start": 3946.4, + "duration": 0.0, + "text": "with a model that is, you know, actually not<01:05:46.960> very<01:05:47.280> good<01:05:47.480> quality,<01:05:48.000> right?<01:05:48.240> Or<01:05:48.440> it" + }, + { + "start": 3948.51, + "duration": 0.0, + "text": "not very good quality, right? Or it" + }, + { + "start": 3948.52, + "duration": 0.0, + "text": "not very good quality, right? Or it might<01:05:48.680> be<01:05:48.800> unrecoverable.<01:05:49.560> You<01:05:49.640> might<01:05:49.800> have" + }, + { + "start": 3949.87, + "duration": 0.0, + "text": "might be unrecoverable. You might have" + }, + { + "start": 3949.88, + "duration": 0.0, + "text": "might be unrecoverable. 
You might have spent,<01:05:50.160> you<01:05:50.240> know,<01:05:50.320> millions<01:05:50.680> of<01:05:50.800> dollars<01:05:51.080> in" + }, + { + "start": 3951.19, + "duration": 0.0, + "text": "spent, you know, millions of dollars in" + }, + { + "start": 3951.2, + "duration": 0.0, + "text": "spent, you know, millions of dollars in training,<01:05:52.040> and,<01:05:52.440> you<01:05:52.480> know,<01:05:52.600> you<01:05:52.680> get<01:05:52.840> to<01:05:52.920> a" + }, + { + "start": 3952.95, + "duration": 0.0, + "text": "training, and, you know, you get to a" + }, + { + "start": 3952.96, + "duration": 0.0, + "text": "training, and, you know, you get to a point<01:05:53.359> where<01:05:53.560> the<01:05:53.680> model<01:05:53.960> is<01:05:54.080> no<01:05:54.240> longer<01:05:54.560> able" + }, + { + "start": 3954.71, + "duration": 0.0, + "text": "point where the model is no longer able" + }, + { + "start": 3954.72, + "duration": 0.0, + "text": "point where the model is no longer able to<01:05:54.800> be<01:05:54.880> trained<01:05:55.200> any<01:05:55.359> further,<01:05:55.760> right?<01:05:55.880> That" + }, + { + "start": 3955.99, + "duration": 0.0, + "text": "to be trained any further, right? That" + }, + { + "start": 3956.0, + "duration": 0.0, + "text": "to be trained any further, right? 
That would<01:05:56.080> be<01:05:56.160> a<01:05:56.200> horrible<01:05:56.600> thing<01:05:56.800> to<01:05:56.920> happen<01:05:57.560> if" + }, + { + "start": 3957.67, + "duration": 0.0, + "text": "would be a horrible thing to happen if" + }, + { + "start": 3957.68, + "duration": 0.0, + "text": "would be a horrible thing to happen if you<01:05:57.720> have<01:05:57.800> a<01:05:57.840> lot<01:05:58.040> of<01:05:58.120> compute<01:05:58.480> that<01:05:58.600> you<01:05:58.680> want" + }, + { + "start": 3958.79, + "duration": 0.0, + "text": "you have a lot of compute that you want" + }, + { + "start": 3958.8, + "duration": 0.0, + "text": "you have a lot of compute that you want to<01:05:58.880> spend." + }, + { + "start": 3960.27, + "duration": 0.0, + "text": "to spend." + }, + { + "start": 3960.28, + "duration": 0.0, + "text": "to spend. So,<01:06:00.760> you<01:06:00.840> don't<01:06:00.960> want<01:06:01.080> to<01:06:01.160> train<01:06:01.440> models<01:06:01.800> that" + }, + { + "start": 3961.91, + "duration": 0.0, + "text": "So, you don't want to train models that" + }, + { + "start": 3961.92, + "duration": 0.0, + "text": "So, you don't want to train models that look<01:06:02.080> kind<01:06:02.240> of<01:06:02.280> like<01:06:02.400> this<01:06:02.560> blue<01:06:02.800> curve<01:06:03.200> with" + }, + { + "start": 3963.27, + "duration": 0.0, + "text": "look kind of like this blue curve with" + }, + { + "start": 3963.28, + "duration": 0.0, + "text": "look kind of like this blue curve with like<01:06:03.400> spikes<01:06:03.880> everywhere<01:06:04.560> and<01:06:04.720> these,<01:06:04.920> you" + }, + { + "start": 3964.95, + "duration": 0.0, + "text": "like spikes everywhere and these, you" + }, + { + "start": 3964.96, + "duration": 0.0, + "text": "like spikes everywhere and these, you know,<01:06:05.080> big<01:06:05.320> gradient<01:06:05.760> norms<01:06:06.080> happening.<01:06:07.040> Um" + }, + { + "start": 3967.39, + "duration": 0.0, + "text": 
"know, big gradient norms happening. Um" + }, + { + "start": 3967.4, + "duration": 0.0, + "text": "know, big gradient norms happening. Um so,<01:06:07.600> what<01:06:07.840> do<01:06:08.000> we<01:06:08.160> do<01:06:08.400> to<01:06:08.520> fix<01:06:08.760> these<01:06:08.960> stability" + }, + { + "start": 3969.51, + "duration": 0.0, + "text": "so, what do we do to fix these stability" + }, + { + "start": 3969.52, + "duration": 0.0, + "text": "so, what do we do to fix these stability issues?<01:06:10.120> I<01:06:10.160> mean,<01:06:10.280> this<01:06:10.480> is<01:06:10.600> really,<01:06:11.280> I<01:06:11.359> would" + }, + { + "start": 3971.47, + "duration": 0.0, + "text": "issues? I mean, this is really, I would" + }, + { + "start": 3971.48, + "duration": 0.0, + "text": "issues? I mean, this is really, I would say,<01:06:11.600> like<01:06:11.720> a<01:06:11.800> core<01:06:12.200> core<01:06:12.680> issue." + }, + { + "start": 3973.71, + "duration": 0.0, + "text": "say, like a core core issue." + }, + { + "start": 3973.72, + "duration": 0.0, + "text": "say, like a core core issue. 
And,<01:06:14.280> you<01:06:14.359> know,<01:06:14.680> if<01:06:14.840> you<01:06:14.960> have<01:06:15.280> stability" + }, + { + "start": 3975.79, + "duration": 0.0, + "text": "And, you know, if you have stability" + }, + { + "start": 3975.8, + "duration": 0.0, + "text": "And, you know, if you have stability issues<01:06:16.640> in<01:06:17.359> language<01:06:17.760> models<01:06:18.080> or<01:06:18.160> in<01:06:18.240> general" + }, + { + "start": 3978.99, + "duration": 0.0, + "text": "issues in language models or in general" + }, + { + "start": 3979.0, + "duration": 0.0, + "text": "issues in language models or in general neural<01:06:19.240> networks,<01:06:20.160> there's<01:06:20.400> a<01:06:20.480> few,<01:06:21.240> you" + }, + { + "start": 3981.31, + "duration": 0.0, + "text": "neural networks, there's a few, you" + }, + { + "start": 3981.32, + "duration": 0.0, + "text": "neural networks, there's a few, you know,<01:06:21.440> usual<01:06:21.840> suspects<01:06:22.520> that<01:06:22.680> you've<01:06:23.080> got<01:06:23.280> to" + }, + { + "start": 3983.31, + "duration": 0.0, + "text": "know, usual suspects that you've got to" + }, + { + "start": 3983.32, + "duration": 0.0, + "text": "know, usual suspects that you've got to start<01:06:23.600> looking<01:06:23.880> at." + }, + { + "start": 3984.63, + "duration": 0.0, + "text": "start looking at." + }, + { + "start": 3984.64, + "duration": 0.0, + "text": "start looking at. 
Um<01:06:24.960> one<01:06:25.160> of<01:06:25.240> them<01:06:25.440> is<01:06:25.560> the<01:06:25.680> soft<01:06:26.120> maxes,<01:06:26.880> and" + }, + { + "start": 3986.99, + "duration": 0.0, + "text": "Um one of them is the soft maxes, and" + }, + { + "start": 3987.0, + "duration": 0.0, + "text": "Um one of them is the soft maxes, and the<01:06:27.080> soft<01:06:27.400> max<01:06:27.640> has<01:06:27.840> two<01:06:28.000> things<01:06:28.359> that<01:06:28.480> are" + }, + { + "start": 3988.87, + "duration": 0.0, + "text": "the soft max has two things that are" + }, + { + "start": 3988.88, + "duration": 0.0, + "text": "the soft max has two things that are both<01:06:29.240> really<01:06:29.520> bad<01:06:29.840> for<01:06:29.960> stability.<01:06:30.680> One<01:06:30.840> of" + }, + { + "start": 3990.87, + "duration": 0.0, + "text": "both really bad for stability. One of" + }, + { + "start": 3990.88, + "duration": 0.0, + "text": "both really bad for stability. One of them<01:06:31.000> is<01:06:31.040> an<01:06:31.160> exponential,<01:06:31.880> right?<01:06:32.120> We<01:06:32.200> can" + }, + { + "start": 3992.31, + "duration": 0.0, + "text": "them is an exponential, right? We can" + }, + { + "start": 3992.32, + "duration": 0.0, + "text": "them is an exponential, right? We can see<01:06:32.480> how<01:06:32.600> that<01:06:32.760> blows<01:06:33.000> up<01:06:33.120> very<01:06:33.320> quickly.<01:06:34.240> Um" + }, + { + "start": 3994.59, + "duration": 0.0, + "text": "see how that blows up very quickly. Um" + }, + { + "start": 3994.6, + "duration": 0.0, + "text": "see how that blows up very quickly. 
Um you<01:06:34.680> also<01:06:34.880> divide<01:06:35.520> two<01:06:35.680> numbers,<01:06:36.080> and<01:06:36.200> that's" + }, + { + "start": 3996.67, + "duration": 0.0, + "text": "you also divide two numbers, and that's" + }, + { + "start": 3996.68, + "duration": 0.0, + "text": "you also divide two numbers, and that's also<01:06:37.080> a<01:06:37.120> potentially<01:06:37.720> very<01:06:37.920> dangerous" + }, + { + "start": 3998.349, + "duration": 0.0, + "text": "also a potentially very dangerous" + }, + { + "start": 3998.359, + "duration": 0.0, + "text": "also a potentially very dangerous operation,<01:06:39.040> right?<01:06:39.800> So,<01:06:39.960> a<01:06:40.000> soft<01:06:40.440> max<01:06:40.800> is<01:06:41.000> one" + }, + { + "start": 4001.19, + "duration": 0.0, + "text": "operation, right? So, a soft max is one" + }, + { + "start": 4001.2, + "duration": 0.0, + "text": "operation, right? So, a soft max is one place<01:06:41.440> where<01:06:41.560> you<01:06:41.640> got<01:06:41.760> to<01:06:41.840> be<01:06:42.160> extra,<01:06:42.520> extra" + }, + { + "start": 4002.75, + "duration": 0.0, + "text": "place where you got to be extra, extra" + }, + { + "start": 4002.76, + "duration": 0.0, + "text": "place where you got to be extra, extra careful." + }, + { + "start": 4004.03, + "duration": 0.0, + "text": "careful." + }, + { + "start": 4004.04, + "duration": 0.0, + "text": "careful. And<01:06:44.200> where<01:06:44.359> are<01:06:44.440> the<01:06:44.560> soft<01:06:44.880> maxes<01:06:45.280> in<01:06:45.359> a" + }, + { + "start": 4005.39, + "duration": 0.0, + "text": "And where are the soft maxes in a" + }, + { + "start": 4005.4, + "duration": 0.0, + "text": "And where are the soft maxes in a language<01:06:45.760> model?<01:06:46.400> Well,<01:06:46.520> there's<01:06:46.840> two<01:06:47.120> of" + }, + { + "start": 4007.23, + "duration": 0.0, + "text": "language model? 
Well, there's two of" + }, + { + "start": 4007.24, + "duration": 0.0, + "text": "language model? Well, there's two of them.<01:06:47.920> There's<01:06:48.280> one<01:06:48.600> on<01:06:48.720> the<01:06:48.840> output<01:06:49.160> side" + }, + { + "start": 4009.55, + "duration": 0.0, + "text": "them. There's one on the output side" + }, + { + "start": 4009.56, + "duration": 0.0, + "text": "them. There's one on the output side when<01:06:49.680> we<01:06:49.840> output<01:06:50.200> our<01:06:50.320> probability" + }, + { + "start": 4010.79, + "duration": 0.0, + "text": "when we output our probability" + }, + { + "start": 4010.8, + "duration": 0.0, + "text": "when we output our probability distribution,<01:06:51.760> and<01:06:51.880> then<01:06:52.080> in<01:06:52.200> attention<01:06:52.760> when" + }, + { + "start": 4012.87, + "duration": 0.0, + "text": "distribution, and then in attention when" + }, + { + "start": 4012.88, + "duration": 0.0, + "text": "distribution, and then in attention when we<01:06:53.000> normalize<01:06:53.520> the<01:06:53.600> attention,<01:06:54.200> there's" + }, + { + "start": 4014.349, + "duration": 0.0, + "text": "we normalize the attention, there's" + }, + { + "start": 4014.359, + "duration": 0.0, + "text": "we normalize the attention, there's going<01:06:54.480> to<01:06:54.520> be<01:06:54.600> another<01:06:54.840> soft<01:06:55.120> max,<01:06:55.440> right?<01:06:56.000> So," + }, + { + "start": 4016.27, + "duration": 0.0, + "text": "going to be another soft max, right? So," + }, + { + "start": 4016.28, + "duration": 0.0, + "text": "going to be another soft max, right? 
So, we<01:06:56.400> can<01:06:56.560> think<01:06:56.760> of<01:06:56.880> both<01:06:57.120> of<01:06:57.200> those<01:06:57.400> as<01:06:57.520> really" + }, + { + "start": 4017.71, + "duration": 0.0, + "text": "we can think of both of those as really" + }, + { + "start": 4017.72, + "duration": 0.0, + "text": "we can think of both of those as really kind<01:06:57.880> of<01:06:57.960> danger<01:06:58.400> zones<01:06:59.200> for<01:06:59.640> our<01:06:59.800> model,<01:07:00.600> um" + }, + { + "start": 4021.23, + "duration": 0.0, + "text": "kind of danger zones for our model, um" + }, + { + "start": 4021.24, + "duration": 0.0, + "text": "kind of danger zones for our model, um especially<01:07:01.880> the<01:07:01.960> attention." + }, + { + "start": 4025.44, + "duration": 0.0, + "text": "But<01:07:05.520> okay." + }, + { + "start": 4026.67, + "duration": 0.0, + "text": "But okay." + }, + { + "start": 4026.68, + "duration": 0.0, + "text": "But okay. Let's<01:07:06.880> start<01:07:07.080> with<01:07:07.240> thinking<01:07:07.440> about<01:07:07.640> the" + }, + { + "start": 4027.79, + "duration": 0.0, + "text": "Let's start with thinking about the" + }, + { + "start": 4027.8, + "duration": 0.0, + "text": "Let's start with thinking about the output<01:07:08.160> soft<01:07:08.440> max.<01:07:09.160> The<01:07:09.280> output<01:07:09.640> soft<01:07:09.920> max<01:07:10.200> can" + }, + { + "start": 4030.349, + "duration": 0.0, + "text": "output soft max. The output soft max can" + }, + { + "start": 4030.359, + "duration": 0.0, + "text": "output soft max. The output soft max can blow<01:07:10.600> up<01:07:10.800> on<01:07:11.000> us.<01:07:11.680> Um" + }, + { + "start": 4032.47, + "duration": 0.0, + "text": "blow up on us. Um" + }, + { + "start": 4032.48, + "duration": 0.0, + "text": "blow up on us. 
Um and<01:07:12.840> one<01:07:12.960> of<01:07:13.040> the<01:07:13.120> things<01:07:13.440> that<01:07:13.560> we<01:07:13.680> can<01:07:13.880> do<01:07:14.520> is" + }, + { + "start": 4034.71, + "duration": 0.0, + "text": "and one of the things that we can do is" + }, + { + "start": 4034.72, + "duration": 0.0, + "text": "and one of the things that we can do is we<01:07:14.800> can<01:07:14.920> try<01:07:15.160> to<01:07:15.280> control" + }, + { + "start": 4036.75, + "duration": 0.0, + "text": "we can try to control" + }, + { + "start": 4036.76, + "duration": 0.0, + "text": "we can try to control um<01:07:16.960> sort<01:07:17.160> of<01:07:17.240> the<01:07:17.600> the<01:07:17.720> normalizer<01:07:18.440> problem." + }, + { + "start": 4039.03, + "duration": 0.0, + "text": "um sort of the the normalizer problem." + }, + { + "start": 4039.04, + "duration": 0.0, + "text": "um sort of the the normalizer problem. So,<01:07:19.920> you<01:07:20.000> know,<01:07:20.120> let's<01:07:20.480> sort<01:07:20.680> of<01:07:20.760> think<01:07:20.920> about" + }, + { + "start": 4041.11, + "duration": 0.0, + "text": "So, you know, let's sort of think about" + }, + { + "start": 4041.12, + "duration": 0.0, + "text": "So, you know, let's sort of think about the<01:07:21.160> soft<01:07:21.480> max<01:07:21.760> calculation.<01:07:22.359> We<01:07:22.480> want<01:07:22.600> to" + }, + { + "start": 4042.63, + "duration": 0.0, + "text": "the soft max calculation. We want to" + }, + { + "start": 4042.64, + "duration": 0.0, + "text": "the soft max calculation. 
We want to compute<01:07:23.040> a<01:07:23.080> log<01:07:23.480> probability<01:07:24.120> to<01:07:24.240> compute<01:07:24.560> the" + }, + { + "start": 4044.63, + "duration": 0.0, + "text": "compute a log probability to compute the" + }, + { + "start": 4044.64, + "duration": 0.0, + "text": "compute a log probability to compute the loss.<01:07:25.560> Now,<01:07:25.760> what<01:07:25.920> is<01:07:26.000> a<01:07:26.080> log<01:07:26.320> probability?" + }, + { + "start": 4046.91, + "duration": 0.0, + "text": "loss. Now, what is a log probability?" + }, + { + "start": 4046.92, + "duration": 0.0, + "text": "loss. Now, what is a log probability? Well,<01:07:27.160> it's,<01:07:27.720> you<01:07:27.800> know,<01:07:28.000> the<01:07:28.160> output<01:07:28.480> of<01:07:28.560> your" + }, + { + "start": 4048.63, + "duration": 0.0, + "text": "Well, it's, you know, the output of your" + }, + { + "start": 4048.64, + "duration": 0.0, + "text": "Well, it's, you know, the output of your model<01:07:29.000> U,<01:07:29.840> and<01:07:30.000> then<01:07:30.160> you've<01:07:30.280> got<01:07:30.440> this<01:07:30.640> log" + }, + { + "start": 4051.07, + "duration": 0.0, + "text": "model U, and then you've got this log" + }, + { + "start": 4051.08, + "duration": 0.0, + "text": "model U, and then you've got this log normalizer,<01:07:32.160> right?<01:07:32.720> This<01:07:32.920> U<01:07:33.240> is" + }, + { + "start": 4053.39, + "duration": 0.0, + "text": "normalizer, right? This U is" + }, + { + "start": 4053.4, + "duration": 0.0, + "text": "normalizer, right? 
This U is well-behaved<01:07:34.240> because<01:07:34.480> in<01:07:34.560> some<01:07:34.720> sense<01:07:34.880> this" + }, + { + "start": 4055.03, + "duration": 0.0, + "text": "well-behaved because in some sense this" + }, + { + "start": 4055.04, + "duration": 0.0, + "text": "well-behaved because in some sense this is<01:07:35.120> the<01:07:35.240> output<01:07:35.520> of<01:07:35.600> your<01:07:35.720> model,<01:07:36.320> right?<01:07:36.480> This" + }, + { + "start": 4056.63, + "duration": 0.0, + "text": "is the output of your model, right? This" + }, + { + "start": 4056.64, + "duration": 0.0, + "text": "is the output of your model, right? This is<01:07:36.760> just<01:07:36.960> the<01:07:37.240> output<01:07:37.520> of<01:07:37.600> your<01:07:37.680> residual" + }, + { + "start": 4058.11, + "duration": 0.0, + "text": "is just the output of your residual" + }, + { + "start": 4058.12, + "duration": 0.0, + "text": "is just the output of your residual stream<01:07:38.440> with<01:07:38.560> all<01:07:38.640> the<01:07:38.760> things<01:07:39.040> that<01:07:39.160> are" + }, + { + "start": 4059.27, + "duration": 0.0, + "text": "stream with all the things that are" + }, + { + "start": 4059.28, + "duration": 0.0, + "text": "stream with all the things that are added<01:07:39.560> in.<01:07:39.720> So,<01:07:39.840> if<01:07:40.320> U<01:07:40.680> is<01:07:40.840> well-behaved,<01:07:41.600> then" + }, + { + "start": 4061.83, + "duration": 0.0, + "text": "added in. So, if U is well-behaved, then" + }, + { + "start": 4061.84, + "duration": 0.0, + "text": "added in. 
So, if U is well-behaved, then log<01:07:42.400> P,<01:07:42.600> the<01:07:42.720> first<01:07:43.000> term,<01:07:43.160> is<01:07:43.280> well-behaved," + }, + { + "start": 4063.83, + "duration": 0.0, + "text": "log P, the first term, is well-behaved," + }, + { + "start": 4063.84, + "duration": 0.0, + "text": "log P, the first term, is well-behaved, right?<01:07:43.960> If<01:07:44.000> the<01:07:44.080> model<01:07:44.359> is<01:07:44.440> being<01:07:44.640> okay." + }, + { + "start": 4065.55, + "duration": 0.0, + "text": "right? If the model is being okay." + }, + { + "start": 4065.56, + "duration": 0.0, + "text": "right? If the model is being okay. Now,<01:07:45.680> the<01:07:45.800> second<01:07:46.280> term,<01:07:46.680> this<01:07:46.880> log<01:07:47.280> Z,<01:07:48.040> this" + }, + { + "start": 4068.39, + "duration": 0.0, + "text": "Now, the second term, this log Z, this" + }, + { + "start": 4068.4, + "duration": 0.0, + "text": "Now, the second term, this log Z, this might<01:07:48.600> not<01:07:48.840> be<01:07:49.000> so<01:07:49.280> okay,<01:07:49.800> right?<01:07:50.040> If<01:07:50.200> Z<01:07:50.440> is" + }, + { + "start": 4070.59, + "duration": 0.0, + "text": "might not be so okay, right? If Z is" + }, + { + "start": 4070.6, + "duration": 0.0, + "text": "might not be so okay, right? 
If Z is really<01:07:50.880> big<01:07:51.120> or<01:07:51.240> really<01:07:51.520> small,<01:07:52.400> even<01:07:52.640> if<01:07:52.720> the" + }, + { + "start": 4072.83, + "duration": 0.0, + "text": "really big or really small, even if the" + }, + { + "start": 4072.84, + "duration": 0.0, + "text": "really big or really small, even if the output<01:07:53.080> of<01:07:53.160> your<01:07:53.320> model<01:07:53.720> is<01:07:53.960> somewhat" + }, + { + "start": 4074.31, + "duration": 0.0, + "text": "output of your model is somewhat" + }, + { + "start": 4074.32, + "duration": 0.0, + "text": "output of your model is somewhat well-behaved,<01:07:54.840> it<01:07:54.960> could<01:07:55.120> blow<01:07:55.400> up.<01:07:55.560> And<01:07:55.680> what" + }, + { + "start": 4075.79, + "duration": 0.0, + "text": "well-behaved, it could blow up. And what" + }, + { + "start": 4075.8, + "duration": 0.0, + "text": "well-behaved, it could blow up. And what is<01:07:55.920> Z?<01:07:56.160> Well,<01:07:56.320> it's<01:07:56.440> an<01:07:56.520> exponential,<01:07:57.320> right?" + }, + { + "start": 4077.51, + "duration": 0.0, + "text": "is Z? Well, it's an exponential, right?" + }, + { + "start": 4077.52, + "duration": 0.0, + "text": "is Z? Well, it's an exponential, right? So,<01:07:57.640> it<01:07:57.760> could<01:07:58.040> potentially<01:07:58.640> blow<01:07:58.800> up<01:07:58.920> very" + }, + { + "start": 4079.11, + "duration": 0.0, + "text": "So, it could potentially blow up very" + }, + { + "start": 4079.12, + "duration": 0.0, + "text": "So, it could potentially blow up very quickly<01:07:59.520> on<01:07:59.680> you." + }, + { + "start": 4080.31, + "duration": 0.0, + "text": "quickly on you." + }, + { + "start": 4080.32, + "duration": 0.0, + "text": "quickly on you. 
Or<01:08:00.480> if<01:08:00.600> this<01:08:00.760> is<01:08:00.840> zero,<01:08:01.120> it<01:08:01.240> could<01:08:01.359> also<01:08:01.560> blow" + }, + { + "start": 4081.75, + "duration": 0.0, + "text": "Or if this is zero, it could also blow" + }, + { + "start": 4081.76, + "duration": 0.0, + "text": "Or if this is zero, it could also blow up<01:08:01.880> on<01:08:02.000> you,<01:08:02.040> right?<01:08:02.200> So,<01:08:02.320> both<01:08:02.560> of<01:08:02.600> those" + }, + { + "start": 4082.79, + "duration": 0.0, + "text": "up on you, right? So, both of those" + }, + { + "start": 4082.8, + "duration": 0.0, + "text": "up on you, right? So, both of those directions<01:08:03.160> are<01:08:03.240> very,<01:08:03.560> very<01:08:03.880> bad." + }, + { + "start": 4084.87, + "duration": 0.0, + "text": "directions are very, very bad." + }, + { + "start": 4084.88, + "duration": 0.0, + "text": "directions are very, very bad. Now,<01:08:05.200> we<01:08:05.320> would<01:08:05.480> ideally<01:08:06.040> like<01:08:06.480> our<01:08:06.640> Z<01:08:07.080> to<01:08:07.200> be" + }, + { + "start": 4087.31, + "duration": 0.0, + "text": "Now, we would ideally like our Z to be" + }, + { + "start": 4087.32, + "duration": 0.0, + "text": "Now, we would ideally like our Z to be somewhere<01:08:07.680> near<01:08:08.000> one,<01:08:08.560> right?" + }, + { + "start": 4089.79, + "duration": 0.0, + "text": "somewhere near one, right?" + }, + { + "start": 4089.8, + "duration": 0.0, + "text": "somewhere near one, right? Um<01:08:10.480> or<01:08:10.640> log<01:08:11.000> Z<01:08:11.280> to<01:08:11.400> be<01:08:11.520> somewhere<01:08:11.800> near<01:08:12.000> zero." + }, + { + "start": 4093.03, + "duration": 0.0, + "text": "Um or log Z to be somewhere near zero." + }, + { + "start": 4093.04, + "duration": 0.0, + "text": "Um or log Z to be somewhere near zero. 
Um<01:08:13.440> what<01:08:13.680> can<01:08:13.800> we<01:08:13.960> do?<01:08:14.640> Well,<01:08:14.880> one<01:08:15.040> of<01:08:15.080> the" + }, + { + "start": 4095.19, + "duration": 0.0, + "text": "Um what can we do? Well, one of the" + }, + { + "start": 4095.2, + "duration": 0.0, + "text": "Um what can we do? Well, one of the things<01:08:15.480> that<01:08:15.640> you<01:08:15.760> notice,<01:08:16.560> right?<01:08:16.759> If<01:08:16.880> you" + }, + { + "start": 4096.99, + "duration": 0.0, + "text": "things that you notice, right? If you" + }, + { + "start": 4097.0, + "duration": 0.0, + "text": "things that you notice, right? If you sort<01:08:17.359> of<01:08:17.440> thought<01:08:17.719> about<01:08:17.960> the<01:08:18.080> action<01:08:18.440> of<01:08:18.520> the" + }, + { + "start": 4098.59, + "duration": 0.0, + "text": "sort of thought about the action of the" + }, + { + "start": 4098.6, + "duration": 0.0, + "text": "sort of thought about the action of the soft<01:08:18.960> max,<01:08:19.839> is<01:08:20.160> this<01:08:20.319> whole<01:08:20.480> thing<01:08:20.600> is" + }, + { + "start": 4100.709, + "duration": 0.0, + "text": "soft max, is this whole thing is" + }, + { + "start": 4100.719, + "duration": 0.0, + "text": "soft max, is this whole thing is overparameterized,<01:08:22.200> right?<01:08:22.839> Um<01:08:23.200> I<01:08:23.280> could" + }, + { + "start": 4103.47, + "duration": 0.0, + "text": "overparameterized, right? Um I could" + }, + { + "start": 4103.48, + "duration": 0.0, + "text": "overparameterized, right? Um I could sort<01:08:23.680> of<01:08:24.080> push<01:08:24.359> things<01:08:24.640> in<01:08:24.759> and<01:08:24.920> out.<01:08:25.160> So,<01:08:25.240> if<01:08:25.400> I" + }, + { + "start": 4105.51, + "duration": 0.0, + "text": "sort of push things in and out. So, if I" + }, + { + "start": 4105.52, + "duration": 0.0, + "text": "sort of push things in and out. 
So, if I add<01:08:25.719> a<01:08:25.759> constant<01:08:26.359> to<01:08:26.480> U,<01:08:27.160> I<01:08:27.240> can<01:08:27.400> manipulate" + }, + { + "start": 4107.99, + "duration": 0.0, + "text": "add a constant to U, I can manipulate" + }, + { + "start": 4108.0, + "duration": 0.0, + "text": "add a constant to U, I can manipulate the<01:08:28.160> Zs<01:08:28.960> without<01:08:29.280> really<01:08:29.520> affecting<01:08:30.000> the" + }, + { + "start": 4110.11, + "duration": 0.0, + "text": "the Zs without really affecting the" + }, + { + "start": 4110.12, + "duration": 0.0, + "text": "the Zs without really affecting the output<01:08:30.400> of<01:08:30.480> the<01:08:30.560> soft<01:08:30.839> max,<01:08:31.080> right?<01:08:31.200> You<01:08:31.280> can" + }, + { + "start": 4111.39, + "duration": 0.0, + "text": "output of the soft max, right? You can" + }, + { + "start": 4111.4, + "duration": 0.0, + "text": "output of the soft max, right? You can cancel<01:08:31.880> out<01:08:32.040> between<01:08:32.319> the<01:08:32.400> normalizer<01:08:33.480> and" + }, + { + "start": 4113.829, + "duration": 0.0, + "text": "cancel out between the normalizer and" + }, + { + "start": 4113.839, + "duration": 0.0, + "text": "cancel out between the normalizer and sort<01:08:34.000> of<01:08:34.080> the<01:08:34.200> output<01:08:34.480> of<01:08:34.600> my<01:08:34.759> model." + }, + { + "start": 4115.789, + "duration": 0.0, + "text": "sort of the output of my model." + }, + { + "start": 4115.799, + "duration": 0.0, + "text": "sort of the output of my model. 
So,<01:08:35.960> because<01:08:36.319> of<01:08:36.440> that<01:08:36.640> property,<01:08:37.640> one<01:08:37.799> thing" + }, + { + "start": 4117.95, + "duration": 0.0, + "text": "So, because of that property, one thing" + }, + { + "start": 4117.96, + "duration": 0.0, + "text": "So, because of that property, one thing that<01:08:38.080> I<01:08:38.160> could<01:08:38.359> do<01:08:38.520> is<01:08:38.640> I<01:08:38.719> could<01:08:38.839> add<01:08:38.960> a" + }, + { + "start": 4118.99, + "duration": 0.0, + "text": "that I could do is I could add a" + }, + { + "start": 4119.0, + "duration": 0.0, + "text": "that I could do is I could add a regularizer.<01:08:40.040> Um<01:08:40.160> this<01:08:40.280> is<01:08:40.359> from<01:08:40.680> from<01:08:40.839> Jacob" + }, + { + "start": 4121.19, + "duration": 0.0, + "text": "regularizer. Um this is from from Jacob" + }, + { + "start": 4121.2, + "duration": 0.0, + "text": "regularizer. Um this is from from Jacob Devlin's<01:08:41.560> paper<01:08:41.799> 2024,<01:08:42.960> uh<01:08:43.000> sorry,<01:08:43.160> 2014,<01:08:44.400> um" + }, + { + "start": 4124.59, + "duration": 0.0, + "text": "Devlin's paper 2024, uh sorry, 2014, um" + }, + { + "start": 4124.6, + "duration": 0.0, + "text": "Devlin's paper 2024, uh sorry, 2014, um in<01:08:44.680> which<01:08:44.839> he<01:08:45.000> adds<01:08:45.359> sort<01:08:45.520> of<01:08:45.600> this<01:08:45.759> squared" + }, + { + "start": 4126.829, + "duration": 0.0, + "text": "in which he adds sort of this squared" + }, + { + "start": 4126.839, + "duration": 0.0, + "text": "in which he adds sort of this squared log<01:08:47.480> Z<01:08:47.759> term.<01:08:48.640> Um" + }, + { + "start": 4129.23, + "duration": 0.0, + "text": "log Z term. Um" + }, + { + "start": 4129.24, + "duration": 0.0, + "text": "log Z term. 
Um and<01:08:49.359> what<01:08:49.480> this<01:08:49.640> is<01:08:49.799> doing<01:08:50.319> is<01:08:50.480> it's<01:08:50.600> just" + }, + { + "start": 4130.789, + "duration": 0.0, + "text": "and what this is doing is it's just" + }, + { + "start": 4130.799, + "duration": 0.0, + "text": "and what this is doing is it's just penalizing<01:08:51.759> how<01:08:51.960> far<01:08:52.359> away<01:08:52.759> your<01:08:52.960> log<01:08:53.319> Z<01:08:53.560> is" + }, + { + "start": 4134.03, + "duration": 0.0, + "text": "penalizing how far away your log Z is" + }, + { + "start": 4134.04, + "duration": 0.0, + "text": "penalizing how far away your log Z is from<01:08:54.280> zero.<01:08:54.960> And<01:08:55.080> if<01:08:55.240> log<01:08:55.520> Z<01:08:55.680> is<01:08:55.799> near<01:08:56.040> zero," + }, + { + "start": 4136.55, + "duration": 0.0, + "text": "from zero. And if log Z is near zero," + }, + { + "start": 4136.56, + "duration": 0.0, + "text": "from zero. And if log Z is near zero, that's<01:08:56.799> nice<01:08:57.120> because<01:08:57.359> this<01:08:57.520> whole" + }, + { + "start": 4138.07, + "duration": 0.0, + "text": "that's nice because this whole" + }, + { + "start": 4138.08, + "duration": 0.0, + "text": "that's nice because this whole expression<01:08:59.000> is<01:08:59.200> kind<01:08:59.359> of<01:08:59.520> numerically" + }, + { + "start": 4140.11, + "duration": 0.0, + "text": "expression is kind of numerically" + }, + { + "start": 4140.12, + "duration": 0.0, + "text": "expression is kind of numerically stable." + }, + { + "start": 4141.43, + "duration": 0.0, + "text": "stable." + }, + { + "start": 4141.44, + "duration": 0.0, + "text": "stable. This<01:09:01.680> is<01:09:01.799> called<01:09:02.040> the<01:09:02.160> Z<01:09:02.480> loss<01:09:02.839> trick.<01:09:03.640> Um<01:09:03.839> it's" + }, + { + "start": 4143.95, + "duration": 0.0, + "text": "This is called the Z loss trick. 
Um it's" + }, + { + "start": 4143.96, + "duration": 0.0, + "text": "This is called the Z loss trick. Um it's been<01:09:04.120> used<01:09:04.359> by<01:09:04.480> a<01:09:04.520> number<01:09:04.880> of<01:09:05.000> papers.<01:09:05.759> Um" + }, + { + "start": 4146.11, + "duration": 0.0, + "text": "been used by a number of papers. Um" + }, + { + "start": 4146.12, + "duration": 0.0, + "text": "been used by a number of papers. Um Jacob<01:09:06.520> Devlin<01:09:06.920> and<01:09:07.040> others<01:09:08.040> uh<01:09:08.160> sort<01:09:08.319> of" + }, + { + "start": 4148.709, + "duration": 0.0, + "text": "Jacob Devlin and others uh sort of" + }, + { + "start": 4148.719, + "duration": 0.0, + "text": "Jacob Devlin and others uh sort of popular<01:09:09.160> or<01:09:09.359> initially<01:09:09.880> pioneered<01:09:10.400> this<01:09:10.560> back" + }, + { + "start": 4150.749, + "duration": 0.0, + "text": "popular or initially pioneered this back" + }, + { + "start": 4150.759, + "duration": 0.0, + "text": "popular or initially pioneered this back in<01:09:10.839> 2014,<01:09:12.000> and<01:09:12.120> then<01:09:12.200> it's<01:09:12.319> become<01:09:12.600> popular" + }, + { + "start": 4153.15, + "duration": 0.0, + "text": "in 2014, and then it's become popular" + }, + { + "start": 4153.16, + "duration": 0.0, + "text": "in 2014, and then it's become popular again<01:09:13.520> through<01:09:13.719> a<01:09:13.759> number<01:09:14.200> of<01:09:14.319> open-source" + }, + { + "start": 4154.91, + "duration": 0.0, + "text": "again through a number of open-source" + }, + { + "start": 4154.92, + "duration": 0.0, + "text": "again through a number of open-source models.<01:09:15.359> Like,<01:09:15.520> Baichuan<01:09:16.000> I<01:09:16.040> think<01:09:16.240> was<01:09:16.359> the" + }, + { + "start": 4156.43, + "duration": 0.0, + "text": "models. Like, Baichuan I think was the" + }, + { + "start": 4156.44, + "duration": 0.0, + "text": "models. 
Like, Baichuan I think was the first<01:09:16.799> open-source<01:09:17.319> model<01:09:17.560> to<01:09:17.680> do<01:09:17.799> it,<01:09:18.200> but" + }, + { + "start": 4158.309, + "duration": 0.0, + "text": "first open-source model to do it, but" + }, + { + "start": 4158.319, + "duration": 0.0, + "text": "first open-source model to do it, but then<01:09:18.440> DCLM<01:09:19.160> and<01:09:19.280> Almo<01:09:19.600> and<01:09:19.719> others<01:09:20.000> have<01:09:20.120> been" + }, + { + "start": 4160.27, + "duration": 0.0, + "text": "then DCLM and Almo and others have been" + }, + { + "start": 4160.28, + "duration": 0.0, + "text": "then DCLM and Almo and others have been using<01:09:20.560> this<01:09:20.759> trick<01:09:21.319> to<01:09:21.440> stabilize<01:09:22.040> their" + }, + { + "start": 4162.15, + "duration": 0.0, + "text": "using this trick to stabilize their" + }, + { + "start": 4162.16, + "duration": 0.0, + "text": "using this trick to stabilize their output<01:09:22.440> soft<01:09:22.719> maxes." + }, + { + "start": 4163.749, + "duration": 0.0, + "text": "output soft maxes." + }, + { + "start": 4163.759, + "duration": 0.0, + "text": "output soft maxes. So,<01:09:23.799> this<01:09:24.000> is<01:09:24.120> this<01:09:24.280> is<01:09:24.359> a<01:09:24.759> surprisingly" + }, + { + "start": 4165.51, + "duration": 0.0, + "text": "So, this is this is a surprisingly" + }, + { + "start": 4165.52, + "duration": 0.0, + "text": "So, this is this is a surprisingly effective<01:09:25.880> thing." + }, + { + "start": 4167.11, + "duration": 0.0, + "text": "effective thing." + }, + { + "start": 4167.12, + "duration": 0.0, + "text": "effective thing. Now,<01:09:27.440> okay.<01:09:27.719> So,<01:09:27.839> let's<01:09:28.080> say<01:09:28.240> we've<01:09:28.480> handled" + }, + { + "start": 4169.269, + "duration": 0.0, + "text": "Now, okay. So, let's say we've handled" + }, + { + "start": 4169.279, + "duration": 0.0, + "text": "Now, okay. 
So, let's say we've handled the<01:09:29.560> instability<01:09:30.200> issues<01:09:30.520> on<01:09:30.640> the<01:09:30.759> output" + }, + { + "start": 4171.03, + "duration": 0.0, + "text": "the instability issues on the output" + }, + { + "start": 4171.04, + "duration": 0.0, + "text": "the instability issues on the output soft<01:09:31.400> max." + }, + { + "start": 4172.91, + "duration": 0.0, + "text": "soft max." + }, + { + "start": 4172.92, + "duration": 0.0, + "text": "soft max. Now,<01:09:33.160> we<01:09:33.240> have<01:09:33.400> to<01:09:33.480> sort<01:09:33.640> of<01:09:33.720> turn<01:09:33.960> our" + }, + { + "start": 4174.03, + "duration": 0.0, + "text": "Now, we have to sort of turn our" + }, + { + "start": 4174.04, + "duration": 0.0, + "text": "Now, we have to sort of turn our attention<01:09:34.400> towards<01:09:34.560> the<01:09:34.720> other<01:09:35.000> potential" + }, + { + "start": 4175.43, + "duration": 0.0, + "text": "attention towards the other potential" + }, + { + "start": 4175.44, + "duration": 0.0, + "text": "attention towards the other potential problem,<01:09:36.279> which<01:09:36.440> is<01:09:36.560> attention,<01:09:37.160> right?<01:09:37.680> And" + }, + { + "start": 4177.749, + "duration": 0.0, + "text": "problem, which is attention, right? And" + }, + { + "start": 4177.759, + "duration": 0.0, + "text": "problem, which is attention, right? And this<01:09:37.960> is<01:09:38.120> a<01:09:39.000> a<01:09:39.080> place<01:09:39.480> where<01:09:39.640> lots<01:09:40.200> of" + }, + { + "start": 4180.39, + "duration": 0.0, + "text": "this is a a place where lots of" + }, + { + "start": 4180.4, + "duration": 0.0, + "text": "this is a a place where lots of degeneracies<01:09:41.080> happen.<01:09:41.480> Lots<01:09:41.759> of<01:09:41.839> techniques" + }, + { + "start": 4182.23, + "duration": 0.0, + "text": "degeneracies happen. 
Lots of techniques" + }, + { + "start": 4182.24, + "duration": 0.0, + "text": "degeneracies happen. Lots of techniques have<01:09:42.400> been<01:09:42.520> developed<01:09:43.480> to<01:09:43.680> control<01:09:44.279> the" + }, + { + "start": 4184.39, + "duration": 0.0, + "text": "have been developed to control the" + }, + { + "start": 4184.4, + "duration": 0.0, + "text": "have been developed to control the instability<01:09:45.000> that<01:09:45.319> attention<01:09:46.240> operations" + }, + { + "start": 4186.789, + "duration": 0.0, + "text": "instability that attention operations" + }, + { + "start": 4186.799, + "duration": 0.0, + "text": "instability that attention operations generate." + }, + { + "start": 4187.95, + "duration": 0.0, + "text": "generate." + }, + { + "start": 4187.96, + "duration": 0.0, + "text": "generate. Um<01:09:48.600> and<01:09:48.759> really<01:09:49.040> the,<01:09:49.600> you<01:09:49.680> know,<01:09:49.759> the" + }, + { + "start": 4189.87, + "duration": 0.0, + "text": "Um and really the, you know, the" + }, + { + "start": 4189.88, + "duration": 0.0, + "text": "Um and really the, you know, the high-level<01:09:50.359> thing<01:09:50.520> that<01:09:50.640> I'll<01:09:50.759> say" + }, + { + "start": 4192.03, + "duration": 0.0, + "text": "high-level thing that I'll say" + }, + { + "start": 4192.04, + "duration": 0.0, + "text": "high-level thing that I'll say is" + }, + { + "start": 4193.07, + "duration": 0.0, + "text": "is" + }, + { + "start": 4193.08, + "duration": 0.0, + "text": "is if<01:09:53.279> you<01:09:53.359> have<01:09:53.600> instability,<01:09:54.400> if<01:09:54.520> you<01:09:54.600> can" + }, + { + "start": 4194.75, + "duration": 0.0, + "text": "if you have instability, if you can" + }, + { + "start": 4194.76, + "duration": 0.0, + "text": "if you have instability, if you can throw<01:09:55.040> a<01:09:55.080> layer<01:09:55.360> norm<01:09:55.560> in<01:09:55.680> there<01:09:55.800> somehow,<01:09:56.680> it" + }, + 
{ + "start": 4196.79, + "duration": 0.0, + "text": "throw a layer norm in there somehow, it" + }, + { + "start": 4196.8, + "duration": 0.0, + "text": "throw a layer norm in there somehow, it might<01:09:57.000> control<01:09:57.400> it.<01:09:57.800> And<01:09:57.880> that's<01:09:58.120> really<01:09:58.480> in" + }, + { + "start": 4198.67, + "duration": 0.0, + "text": "might control it. And that's really in" + }, + { + "start": 4198.68, + "duration": 0.0, + "text": "might control it. And that's really in some<01:09:58.960> sense<01:09:59.280> the<01:10:00.080> the<01:10:00.200> design<01:10:00.640> philosophy" + }, + { + "start": 4201.35, + "duration": 0.0, + "text": "some sense the the design philosophy" + }, + { + "start": 4201.36, + "duration": 0.0, + "text": "some sense the the design philosophy behind<01:10:01.720> this<01:10:01.880> idea<01:10:02.240> called<01:10:02.480> the<01:10:02.600> QK<01:10:03.080> norm.<01:10:03.920> Um" + }, + { + "start": 4204.51, + "duration": 0.0, + "text": "behind this idea called the QK norm. Um" + }, + { + "start": 4204.52, + "duration": 0.0, + "text": "behind this idea called the QK norm. 
Um so,<01:10:04.680> what<01:10:04.840> you<01:10:05.000> do<01:10:05.240> is<01:10:05.360> remember<01:10:05.760> that<01:10:05.920> we" + }, + { + "start": 4206.03, + "duration": 0.0, + "text": "so, what you do is remember that we" + }, + { + "start": 4206.04, + "duration": 0.0, + "text": "so, what you do is remember that we have,<01:10:06.600> you<01:10:06.680> know,<01:10:06.880> our<01:10:07.040> Qs<01:10:07.480> and<01:10:07.600> Ks<01:10:08.680> um<01:10:09.080> that" + }, + { + "start": 4209.19, + "duration": 0.0, + "text": "have, you know, our Qs and Ks um that" + }, + { + "start": 4209.2, + "duration": 0.0, + "text": "have, you know, our Qs and Ks um that are<01:10:09.320> going<01:10:09.600> to<01:10:09.680> be<01:10:09.840> multiplied<01:10:10.520> together,<01:10:11.200> and" + }, + { + "start": 4211.31, + "duration": 0.0, + "text": "are going to be multiplied together, and" + }, + { + "start": 4211.32, + "duration": 0.0, + "text": "are going to be multiplied together, and then<01:10:11.440> they're<01:10:11.520> going<01:10:11.640> to<01:10:11.720> go<01:10:11.880> into<01:10:12.040> the<01:10:12.120> soft" + }, + { + "start": 4212.47, + "duration": 0.0, + "text": "then they're going to go into the soft" + }, + { + "start": 4212.48, + "duration": 0.0, + "text": "then they're going to go into the soft max,<01:10:12.800> right?<01:10:12.920> So,<01:10:13.000> in<01:10:13.080> the<01:10:13.160> standard" + }, + { + "start": 4213.83, + "duration": 0.0, + "text": "max, right? So, in the standard" + }, + { + "start": 4213.84, + "duration": 0.0, + "text": "max, right? 
So, in the standard attention<01:10:14.280> operation,<01:10:14.960> I'm<01:10:15.000> going<01:10:15.120> to<01:10:15.240> layer" + }, + { + "start": 4215.55, + "duration": 0.0, + "text": "attention operation, I'm going to layer" + }, + { + "start": 4215.56, + "duration": 0.0, + "text": "attention operation, I'm going to layer norm<01:10:15.840> as<01:10:16.000> a<01:10:16.040> pre-layer<01:10:16.560> norm,<01:10:17.280> multiply<01:10:17.840> with" + }, + { + "start": 4217.99, + "duration": 0.0, + "text": "norm as a pre-layer norm, multiply with" + }, + { + "start": 4218.0, + "duration": 0.0, + "text": "norm as a pre-layer norm, multiply with a<01:10:18.040> QKV,<01:10:18.920> and<01:10:19.040> then<01:10:19.120> I'm<01:10:19.200> going<01:10:19.320> to<01:10:19.360> get<01:10:19.480> my<01:10:19.600> Qs" + }, + { + "start": 4219.91, + "duration": 0.0, + "text": "a QKV, and then I'm going to get my Qs" + }, + { + "start": 4219.92, + "duration": 0.0, + "text": "a QKV, and then I'm going to get my Qs and<01:10:20.040> Ks.<01:10:20.680> Those<01:10:20.880> will<01:10:21.000> get<01:10:21.160> multiplied<01:10:21.760> by<01:10:21.920> a" + }, + { + "start": 4221.95, + "duration": 0.0, + "text": "and Ks. Those will get multiplied by a" + }, + { + "start": 4221.96, + "duration": 0.0, + "text": "and Ks. 
Those will get multiplied by a matrix<01:10:22.360> multiply,<01:10:22.840> I'll<01:10:22.960> soft<01:10:23.360> max<01:10:23.680> them,<01:10:24.200> and" + }, + { + "start": 4224.31, + "duration": 0.0, + "text": "matrix multiply, I'll soft max them, and" + }, + { + "start": 4224.32, + "duration": 0.0, + "text": "matrix multiply, I'll soft max them, and I'll<01:10:24.760> multiply<01:10:25.280> that<01:10:25.440> with<01:10:25.520> V<01:10:25.720> to<01:10:25.840> get<01:10:26.000> the" + }, + { + "start": 4226.07, + "duration": 0.0, + "text": "I'll multiply that with V to get the" + }, + { + "start": 4226.08, + "duration": 0.0, + "text": "I'll multiply that with V to get the weighted<01:10:26.400> average,<01:10:27.120> and<01:10:27.200> then<01:10:27.280> I'll<01:10:27.440> output" + }, + { + "start": 4227.75, + "duration": 0.0, + "text": "weighted average, and then I'll output" + }, + { + "start": 4227.76, + "duration": 0.0, + "text": "weighted average, and then I'll output whatever<01:10:28.320> comes<01:10:28.560> after,<01:10:28.880> right?<01:10:29.080> So,<01:10:29.160> this<01:10:29.320> is" + }, + { + "start": 4229.43, + "duration": 0.0, + "text": "whatever comes after, right? So, this is" + }, + { + "start": 4229.44, + "duration": 0.0, + "text": "whatever comes after, right? So, this is our<01:10:29.560> usual<01:10:30.080> attention.<01:10:31.000> Now,<01:10:31.240> what<01:10:31.480> happens" + }, + { + "start": 4232.23, + "duration": 0.0, + "text": "our usual attention. Now, what happens" + }, + { + "start": 4232.24, + "duration": 0.0, + "text": "our usual attention. 
Now, what happens if<01:10:32.360> we<01:10:32.520> just<01:10:32.920> throw<01:10:33.240> in<01:10:33.360> a<01:10:33.440> layer<01:10:33.760> norm<01:10:34.480> before" + }, + { + "start": 4235.35, + "duration": 0.0, + "text": "if we just throw in a layer norm before" + }, + { + "start": 4235.36, + "duration": 0.0, + "text": "if we just throw in a layer norm before we<01:10:35.480> multiply<01:10:36.040> the<01:10:36.160> Qs<01:10:36.720> and<01:10:36.880> Ks?<01:10:37.680> If<01:10:37.840> we<01:10:37.960> do" + }, + { + "start": 4238.19, + "duration": 0.0, + "text": "we multiply the Qs and Ks? If we do" + }, + { + "start": 4238.2, + "duration": 0.0, + "text": "we multiply the Qs and Ks? If we do that,<01:10:38.760> then<01:10:38.920> we<01:10:39.080> know<01:10:39.880> that<01:10:40.080> the<01:10:40.240> inputs<01:10:40.640> to" + }, + { + "start": 4240.75, + "duration": 0.0, + "text": "that, then we know that the inputs to" + }, + { + "start": 4240.76, + "duration": 0.0, + "text": "that, then we know that the inputs to this<01:10:40.960> matrix<01:10:41.320> multiply,<01:10:41.800> and<01:10:42.000> therefore<01:10:42.400> the" + }, + { + "start": 4242.51, + "duration": 0.0, + "text": "this matrix multiply, and therefore the" + }, + { + "start": 4242.52, + "duration": 0.0, + "text": "this matrix multiply, and therefore the inputs<01:10:42.840> to<01:10:42.920> the<01:10:43.000> soft<01:10:43.440> max,<01:10:44.280> roughly<01:10:44.960> have<01:10:45.280> the" + }, + { + "start": 4245.35, + "duration": 0.0, + "text": "inputs to the soft max, roughly have the" + }, + { + "start": 4245.36, + "duration": 0.0, + "text": "inputs to the soft max, roughly have the same<01:10:45.760> scale.<01:10:46.080> They're<01:10:46.280> always<01:10:46.640> going<01:10:46.760> to<01:10:46.840> have" + }, + { + "start": 4247.07, + "duration": 0.0, + "text": "same scale. They're always going to have" + }, + { + "start": 4247.08, + "duration": 0.0, + "text": "same scale. 
They're always going to have a<01:10:47.120> scale<01:10:47.520> of<01:10:47.680> roughly<01:10:48.080> one<01:10:48.800> because<01:10:49.040> we've," + }, + { + "start": 4249.47, + "duration": 0.0, + "text": "a scale of roughly one because we've," + }, + { + "start": 4249.48, + "duration": 0.0, + "text": "a scale of roughly one because we've, you<01:10:49.560> know,<01:10:50.160> used<01:10:50.440> RMS<01:10:50.840> norm<01:10:51.040> to<01:10:51.120> divide<01:10:51.920> the" + }, + { + "start": 4251.99, + "duration": 0.0, + "text": "you know, used RMS norm to divide the" + }, + { + "start": 4252.0, + "duration": 0.0, + "text": "you know, used RMS norm to divide the size<01:10:52.480> of<01:10:52.600> those<01:10:52.840> Qs<01:10:53.120> and<01:10:53.280> Ks." + }, + { + "start": 4254.63, + "duration": 0.0, + "text": "size of those Qs and Ks." + }, + { + "start": 4254.64, + "duration": 0.0, + "text": "size of those Qs and Ks. Okay.<01:10:55.360> If<01:10:55.560> we<01:10:55.680> do<01:10:55.920> that,<01:10:56.480> then,<01:10:57.000> you<01:10:57.080> know," + }, + { + "start": 4257.15, + "duration": 0.0, + "text": "Okay. If we do that, then, you know," + }, + { + "start": 4257.16, + "duration": 0.0, + "text": "Okay. If we do that, then, you know, we're<01:10:57.320> kind<01:10:57.520> of<01:10:57.600> going<01:10:57.800> to<01:10:58.000> keep<01:10:58.600> this<01:10:58.800> soft" + }, + { + "start": 4259.07, + "duration": 0.0, + "text": "we're kind of going to keep this soft" + }, + { + "start": 4259.08, + "duration": 0.0, + "text": "we're kind of going to keep this soft max<01:10:59.320> operation<01:10:59.760> stable.<01:11:00.320> Tons<01:11:00.680> of<01:11:00.800> different" + }, + { + "start": 4261.07, + "duration": 0.0, + "text": "max operation stable. Tons of different" + }, + { + "start": 4261.08, + "duration": 0.0, + "text": "max operation stable. 
Tons of different models<01:11:01.440> do<01:11:01.600> this.<01:11:01.800> It's<01:11:01.920> originally<01:11:02.520> from" + }, + { + "start": 4263.75, + "duration": 0.0, + "text": "models do this. It's originally from" + }, + { + "start": 4263.76, + "duration": 0.0, + "text": "models do this. It's originally from the<01:11:03.880> multimodal<01:11:04.600> world." + }, + { + "start": 4266.51, + "duration": 0.0, + "text": "the multimodal world." + }, + { + "start": 4266.52, + "duration": 0.0, + "text": "the multimodal world. You<01:11:06.600> know,<01:11:06.760> some<01:11:07.000> folks<01:11:07.400> who<01:11:07.520> were<01:11:07.880> doing" + }, + { + "start": 4268.23, + "duration": 0.0, + "text": "You know, some folks who were doing" + }, + { + "start": 4268.24, + "duration": 0.0, + "text": "You know, some folks who were doing making<01:11:08.520> multimodal<01:11:09.080> models<01:11:09.360> sort<01:11:09.480> of" + }, + { + "start": 4269.55, + "duration": 0.0, + "text": "making multimodal models sort of" + }, + { + "start": 4269.56, + "duration": 0.0, + "text": "making multimodal models sort of initially<01:11:09.920> discovered<01:11:10.440> QK<01:11:10.800> norm." + }, + { + "start": 4271.83, + "duration": 0.0, + "text": "initially discovered QK norm." + }, + { + "start": 4271.84, + "duration": 0.0, + "text": "initially discovered QK norm. E<01:11:11.960> to<01:11:12.080> fix<01:11:12.480> and<01:11:12.680> chameleon<01:11:13.360> really,<01:11:13.840> you<01:11:13.920> know," + }, + { + "start": 4274.35, + "duration": 0.0, + "text": "E to fix and chameleon really, you know," + }, + { + "start": 4274.36, + "duration": 0.0, + "text": "E to fix and chameleon really, you know, used<01:11:14.680> this<01:11:15.040> and<01:11:15.160> like<01:11:15.320> proved<01:11:15.640> it<01:11:15.760> out.<01:11:16.240> And" + }, + { + "start": 4276.31, + "duration": 0.0, + "text": "used this and like proved it out. 
And" + }, + { + "start": 4276.32, + "duration": 0.0, + "text": "used this and like proved it out. And then<01:11:16.440> a<01:11:16.480> number<01:11:16.880> of<01:11:17.000> other<01:11:17.840> open-source" + }, + { + "start": 4278.39, + "duration": 0.0, + "text": "then a number of other open-source" + }, + { + "start": 4278.4, + "duration": 0.0, + "text": "then a number of other open-source language<01:11:18.760> models,<01:11:19.320> you<01:11:19.440> know,<01:11:19.560> realized<01:11:20.080> that" + }, + { + "start": 4280.15, + "duration": 0.0, + "text": "language models, you know, realized that" + }, + { + "start": 4280.16, + "duration": 0.0, + "text": "language models, you know, realized that the<01:11:20.240> same<01:11:20.480> tricks<01:11:20.800> are<01:11:20.920> entirely<01:11:21.560> applicable" + }, + { + "start": 4282.47, + "duration": 0.0, + "text": "the same tricks are entirely applicable" + }, + { + "start": 4282.48, + "duration": 0.0, + "text": "the same tricks are entirely applicable to" + }, + { + "start": 4283.71, + "duration": 0.0, + "text": "to" + }, + { + "start": 4283.72, + "duration": 0.0, + "text": "to stabilizing<01:11:24.440> attention<01:11:25.000> for<01:11:25.080> language" + }, + { + "start": 4285.43, + "duration": 0.0, + "text": "stabilizing attention for language" + }, + { + "start": 4285.44, + "duration": 0.0, + "text": "stabilizing attention for language models,<01:11:26.040> and<01:11:26.120> I<01:11:26.200> think<01:11:26.520> this<01:11:26.640> is<01:11:26.760> now<01:11:26.920> very," + }, + { + "start": 4287.19, + "duration": 0.0, + "text": "models, and I think this is now very," + }, + { + "start": 4287.2, + "duration": 0.0, + "text": "models, and I think this is now very, very<01:11:27.440> standard.<01:11:28.000> Like<01:11:28.120> QK<01:11:28.440> norm<01:11:28.640> is<01:11:28.760> actually" + }, + { + "start": 4289.03, + "duration": 0.0, + "text": "very standard. 
Like QK norm is actually" + }, + { + "start": 4289.04, + "duration": 0.0, + "text": "very standard. Like QK norm is actually a<01:11:29.080> very<01:11:29.360> standard<01:11:29.760> intervention<01:11:30.600> that<01:11:30.800> most" + }, + { + "start": 4291.15, + "duration": 0.0, + "text": "a very standard intervention that most" + }, + { + "start": 4291.16, + "duration": 0.0, + "text": "a very standard intervention that most of<01:11:31.240> the<01:11:31.360> large<01:11:31.640> models<01:11:32.040> now<01:11:32.280> introduce.<01:11:33.320> Um<01:11:34.040> it" + }, + { + "start": 4294.19, + "duration": 0.0, + "text": "of the large models now introduce. Um it" + }, + { + "start": 4294.2, + "duration": 0.0, + "text": "of the large models now introduce. Um it doesn't<01:11:34.520> seem<01:11:34.680> to<01:11:34.760> affect<01:11:35.040> performance<01:11:36.280> um" + }, + { + "start": 4296.35, + "duration": 0.0, + "text": "doesn't seem to affect performance um" + }, + { + "start": 4296.36, + "duration": 0.0, + "text": "doesn't seem to affect performance um from<01:11:36.640> lots<01:11:36.840> of<01:11:36.920> different<01:11:37.160> training<01:11:37.480> runs," + }, + { + "start": 4298.07, + "duration": 0.0, + "text": "from lots of different training runs," + }, + { + "start": 4298.08, + "duration": 0.0, + "text": "from lots of different training runs, but<01:11:38.200> it<01:11:38.320> does<01:11:38.960> definitely<01:11:39.480> prevent<01:11:40.160> the<01:11:40.240> kinds" + }, + { + "start": 4300.55, + "duration": 0.0, + "text": "but it does definitely prevent the kinds" + }, + { + "start": 4300.56, + "duration": 0.0, + "text": "but it does definitely prevent the kinds of<01:11:41.160> um<01:11:41.240> attention<01:11:41.560> degeneracies.<01:11:42.720> Um<01:11:43.360> and," + }, + { + "start": 4303.55, + "duration": 0.0, + "text": "of um attention degeneracies. 
Um and," + }, + { + "start": 4303.56, + "duration": 0.0, + "text": "of um attention degeneracies. Um and, you<01:11:43.640> know,<01:11:43.760> I<01:11:43.920> I'm<01:11:44.040> really" + }, + { + "start": 4304.699, + "duration": 0.0, + "text": "you know, I I'm really" + }, + { + "start": 4304.709, + "duration": 0.0, + "text": "you know, I I'm really >> [laughter]" + }, + { + "start": 4305.23, + "duration": 0.0, + "text": ">> [laughter]" + }, + { + "start": 4305.24, + "duration": 0.0, + "text": ">> [laughter] >> the<01:11:45.520> the<01:11:45.800> way<01:11:46.000> that<01:11:46.160> I've<01:11:46.240> seen<01:11:46.520> this<01:11:46.800> is,<01:11:47.240> you" + }, + { + "start": 4307.31, + "duration": 0.0, + "text": ">> the the way that I've seen this is, you" + }, + { + "start": 4307.32, + "duration": 0.0, + "text": ">> the the way that I've seen this is, you know,<01:11:47.440> we<01:11:47.560> have<01:11:47.800> layer<01:11:48.000> norms<01:11:48.720> initially<01:11:49.240> in" + }, + { + "start": 4309.31, + "duration": 0.0, + "text": "know, we have layer norms initially in" + }, + { + "start": 4309.32, + "duration": 0.0, + "text": "know, we have layer norms initially in the<01:11:49.400> pre-norm.<01:11:50.080> Now,<01:11:50.280> we<01:11:50.480> add<01:11:50.760> them<01:11:50.920> after<01:11:51.280> the" + }, + { + "start": 4311.75, + "duration": 0.0, + "text": "the pre-norm. Now, we add them after the" + }, + { + "start": 4311.76, + "duration": 0.0, + "text": "the pre-norm. 
Now, we add them after the the" + }, + { + "start": 4312.63, + "duration": 0.0, + "text": "the" + }, + { + "start": 4312.64, + "duration": 0.0, + "text": "the nonlinearities<01:11:53.560> in<01:11:53.640> each<01:11:53.800> block,<01:11:54.360> and<01:11:54.480> now" + }, + { + "start": 4314.59, + "duration": 0.0, + "text": "nonlinearities in each block, and now" + }, + { + "start": 4314.6, + "duration": 0.0, + "text": "nonlinearities in each block, and now we're<01:11:54.720> throwing<01:11:55.040> them<01:11:55.200> in<01:11:55.360> both<01:11:55.680> the<01:11:55.760> Qs<01:11:56.160> and" + }, + { + "start": 4316.27, + "duration": 0.0, + "text": "we're throwing them in both the Qs and" + }, + { + "start": 4316.28, + "duration": 0.0, + "text": "we're throwing them in both the Qs and the<01:11:56.360> Ks.<01:11:56.640> And<01:11:56.760> really,<01:11:57.040> I<01:11:57.120> think<01:11:57.320> this<01:11:57.480> is<01:11:57.720> is" + }, + { + "start": 4317.83, + "duration": 0.0, + "text": "the Ks. And really, I think this is is" + }, + { + "start": 4317.84, + "duration": 0.0, + "text": "the Ks. And really, I think this is is is<01:11:57.920> kind<01:11:58.080> of<01:11:58.200> getting<01:11:58.520> at" + }, + { + "start": 4320.03, + "duration": 0.0, + "text": "is kind of getting at" + }, + { + "start": 4320.04, + "duration": 0.0, + "text": "is kind of getting at the<01:12:00.200> stabilization<01:12:01.000> tricks<01:12:01.280> that<01:12:01.440> people" + }, + { + "start": 4321.67, + "duration": 0.0, + "text": "the stabilization tricks that people" + }, + { + "start": 4321.68, + "duration": 0.0, + "text": "the stabilization tricks that people apply<01:12:01.960> to<01:12:02.080> this<01:12:02.240> world.<01:12:02.960> Okay." + }, + { + "start": 4324.07, + "duration": 0.0, + "text": "apply to this world. Okay." + }, + { + "start": 4324.08, + "duration": 0.0, + "text": "apply to this world. Okay. 
Now," + }, + { + "start": 4325.11, + "duration": 0.0, + "text": "Now," + }, + { + "start": 4325.12, + "duration": 0.0, + "text": "Now, um<01:12:05.280> the<01:12:05.440> final<01:12:05.880> set<01:12:06.080> of<01:12:06.200> things<01:12:06.560> that<01:12:06.720> I'll" + }, + { + "start": 4326.83, + "duration": 0.0, + "text": "um the final set of things that I'll" + }, + { + "start": 4326.84, + "duration": 0.0, + "text": "um the final set of things that I'll talk<01:12:07.120> about<01:12:07.400> as<01:12:07.520> a<01:12:07.560> stability<01:12:08.040> intervention." + }, + { + "start": 4328.75, + "duration": 0.0, + "text": "talk about as a stability intervention." + }, + { + "start": 4328.76, + "duration": 0.0, + "text": "talk about as a stability intervention. And<01:12:09.000> frankly,<01:12:09.480> this<01:12:09.720> one<01:12:10.440> is<01:12:10.920> not<01:12:11.280> as<01:12:11.440> popular" + }, + { + "start": 4331.99, + "duration": 0.0, + "text": "And frankly, this one is not as popular" + }, + { + "start": 4332.0, + "duration": 0.0, + "text": "And frankly, this one is not as popular and<01:12:12.200> more<01:12:12.480> of<01:12:12.680> a<01:12:13.080> of<01:12:13.240> a<01:12:13.440> Google-specific<01:12:14.480> trick" + }, + { + "start": 4334.87, + "duration": 0.0, + "text": "and more of a of a Google-specific trick" + }, + { + "start": 4334.88, + "duration": 0.0, + "text": "and more of a of a Google-specific trick that<01:12:15.000> I've<01:12:15.120> seen.<01:12:16.000> Um<01:12:16.640> but<01:12:17.480> uh<01:12:17.560> logit<01:12:17.960> soft" + }, + { + "start": 4338.31, + "duration": 0.0, + "text": "that I've seen. Um but uh logit soft" + }, + { + "start": 4338.32, + "duration": 0.0, + "text": "that I've seen. 
Um but uh logit soft capping<01:12:18.680> is<01:12:18.760> a<01:12:18.800> much<01:12:19.040> harder<01:12:19.520> intervention" + }, + { + "start": 4340.39, + "duration": 0.0, + "text": "capping is a much harder intervention" + }, + { + "start": 4340.4, + "duration": 0.0, + "text": "capping is a much harder intervention that<01:12:20.560> some<01:12:20.760> people<01:12:21.040> apply.<01:12:21.400> So,<01:12:21.560> this<01:12:21.800> one,<01:12:22.680> um" + }, + { + "start": 4343.11, + "duration": 0.0, + "text": "that some people apply. So, this one, um" + }, + { + "start": 4343.12, + "duration": 0.0, + "text": "that some people apply. So, this one, um you<01:12:23.200> know,<01:12:23.400> in<01:12:23.560> QK<01:12:23.960> norm,<01:12:24.640> what<01:12:24.800> we're<01:12:24.920> doing" + }, + { + "start": 4345.59, + "duration": 0.0, + "text": "you know, in QK norm, what we're doing" + }, + { + "start": 4345.6, + "duration": 0.0, + "text": "you know, in QK norm, what we're doing is<01:12:25.760> we<01:12:25.880> are<01:12:26.000> controlling<01:12:26.520> the<01:12:26.680> inputs<01:12:27.160> to<01:12:27.200> the" + }, + { + "start": 4347.27, + "duration": 0.0, + "text": "is we are controlling the inputs to the" + }, + { + "start": 4347.28, + "duration": 0.0, + "text": "is we are controlling the inputs to the soft<01:12:27.640> max<01:12:28.280> and<01:12:28.360> sort<01:12:28.480> of<01:12:28.600> hoping<01:12:29.000> that<01:12:29.160> the" + }, + { + "start": 4349.27, + "duration": 0.0, + "text": "soft max and sort of hoping that the" + }, + { + "start": 4349.28, + "duration": 0.0, + "text": "soft max and sort of hoping that the outputs<01:12:29.600> are<01:12:29.680> well-behaved.<01:12:30.720> If<01:12:30.880> we<01:12:31.040> really," + }, + { + "start": 4351.51, + "duration": 0.0, + "text": "outputs are well-behaved. If we really," + }, + { + "start": 4351.52, + "duration": 0.0, + "text": "outputs are well-behaved. 
If we really, really<01:12:31.880> want<01:12:32.040> to<01:12:32.120> enforce" + }, + { + "start": 4353.27, + "duration": 0.0, + "text": "really want to enforce" + }, + { + "start": 4353.28, + "duration": 0.0, + "text": "really want to enforce um<01:12:33.760> well-behaved<01:12:34.520> outputs,<01:12:35.160> what<01:12:35.280> we<01:12:35.400> can<01:12:35.560> do" + }, + { + "start": 4356.15, + "duration": 0.0, + "text": "um well-behaved outputs, what we can do" + }, + { + "start": 4356.16, + "duration": 0.0, + "text": "um well-behaved outputs, what we can do is<01:12:36.280> we<01:12:36.360> can<01:12:36.480> kind<01:12:36.600> of<01:12:36.720> take<01:12:36.880> the<01:12:37.000> logits,<01:12:37.560> the" + }, + { + "start": 4357.63, + "duration": 0.0, + "text": "is we can kind of take the logits, the" + }, + { + "start": 4357.64, + "duration": 0.0, + "text": "is we can kind of take the logits, the things<01:12:37.880> that<01:12:38.000> go<01:12:38.160> straight<01:12:38.600> into<01:12:38.760> the<01:12:38.880> soft" + }, + { + "start": 4359.15, + "duration": 0.0, + "text": "things that go straight into the soft" + }, + { + "start": 4359.16, + "duration": 0.0, + "text": "things that go straight into the soft max,<01:12:39.720> and<01:12:39.800> we<01:12:39.880> can<01:12:39.960> just<01:12:40.200> cap<01:12:40.560> them<01:12:40.720> off<01:12:40.920> so" + }, + { + "start": 4360.99, + "duration": 0.0, + "text": "max, and we can just cap them off so" + }, + { + "start": 4361.0, + "duration": 0.0, + "text": "max, and we can just cap them off so they<01:12:41.120> can<01:12:41.320> never<01:12:41.680> be<01:12:41.840> too<01:12:42.080> large<01:12:42.720> or<01:12:42.840> too" + }, + { + "start": 4363.03, + "duration": 0.0, + "text": "they can never be too large or too" + }, + { + "start": 4363.04, + "duration": 0.0, + "text": "they can never be too large or too small,<01:12:43.440> right?<01:12:43.640> This<01:12:43.760> is<01:12:43.880> a<01:12:43.960> 
hard<01:12:44.320> almost<01:12:44.640> a" + }, + { + "start": 4364.67, + "duration": 0.0, + "text": "small, right? This is a hard almost a" + }, + { + "start": 4364.68, + "duration": 0.0, + "text": "small, right? This is a hard almost a hard<01:12:44.880> constraint.<01:12:45.720> Um<01:12:45.840> it's<01:12:45.960> called<01:12:46.160> a<01:12:46.200> soft" + }, + { + "start": 4366.55, + "duration": 0.0, + "text": "hard constraint. Um it's called a soft" + }, + { + "start": 4366.56, + "duration": 0.0, + "text": "hard constraint. Um it's called a soft cap,<01:12:46.800> of<01:12:46.920> course,<01:12:47.120> but<01:12:47.240> a<01:12:47.320> tan<01:12:47.560> H,<01:12:47.760> you<01:12:47.800> know," + }, + { + "start": 4367.91, + "duration": 0.0, + "text": "cap, of course, but a tan H, you know," + }, + { + "start": 4367.92, + "duration": 0.0, + "text": "cap, of course, but a tan H, you know, is<01:12:48.040> bounded<01:12:48.800> at<01:12:48.960> some<01:12:49.160> value." + }, + { + "start": 4370.47, + "duration": 0.0, + "text": "is bounded at some value." + }, + { + "start": 4370.48, + "duration": 0.0, + "text": "is bounded at some value. Um<01:12:51.240> and<01:12:51.440> so<01:12:52.120> uh<01:12:52.200> this<01:12:52.400> is<01:12:52.560> in<01:12:52.800> the<01:12:52.960> Gemma" + }, + { + "start": 4373.23, + "duration": 0.0, + "text": "Um and so uh this is in the Gemma" + }, + { + "start": 4373.24, + "duration": 0.0, + "text": "Um and so uh this is in the Gemma models.<01:12:53.920> Um<01:12:54.080> I<01:12:54.160> think<01:12:54.360> both<01:12:54.960> Gemma<01:12:55.400> or<01:12:55.560> like" + }, + { + "start": 4375.83, + "duration": 0.0, + "text": "models. Um I think both Gemma or like" + }, + { + "start": 4375.84, + "duration": 0.0, + "text": "models. 
Um I think both Gemma or like Gemma's<01:12:56.320> two,<01:12:56.560> three,<01:12:56.880> and<01:12:57.000> four<01:12:57.920> all<01:12:58.200> use<01:12:58.400> the" + }, + { + "start": 4378.47, + "duration": 0.0, + "text": "Gemma's two, three, and four all use the" + }, + { + "start": 4378.48, + "duration": 0.0, + "text": "Gemma's two, three, and four all use the logit<01:12:58.840> soft<01:12:59.200> cap<01:12:59.600> trick.<01:13:00.560> Um<01:13:00.800> and<01:13:00.920> what<01:13:01.040> they" + }, + { + "start": 4381.19, + "duration": 0.0, + "text": "logit soft cap trick. Um and what they" + }, + { + "start": 4381.2, + "duration": 0.0, + "text": "logit soft cap trick. Um and what they do<01:13:01.840> is<01:13:02.080> they<01:13:02.280> take<01:13:02.600> all<01:13:02.760> of<01:13:02.840> their<01:13:03.040> logits<01:13:03.640> from" + }, + { + "start": 4383.79, + "duration": 0.0, + "text": "do is they take all of their logits from" + }, + { + "start": 4383.8, + "duration": 0.0, + "text": "do is they take all of their logits from the<01:13:03.880> attention<01:13:04.280> layers,<01:13:04.960> and<01:13:05.080> then<01:13:05.240> they<01:13:05.360> soft" + }, + { + "start": 4385.75, + "duration": 0.0, + "text": "the attention layers, and then they soft" + }, + { + "start": 4385.76, + "duration": 0.0, + "text": "the attention layers, and then they soft cap<01:13:06.000> them<01:13:06.200> at<01:13:06.320> some<01:13:06.440> value." + }, + { + "start": 4387.87, + "duration": 0.0, + "text": "cap them at some value." + }, + { + "start": 4387.88, + "duration": 0.0, + "text": "cap them at some value. 
Um<01:13:08.640> some<01:13:08.800> Nvidia<01:13:09.240> folks<01:13:09.560> have<01:13:09.680> done<01:13:09.880> actually" + }, + { + "start": 4390.19, + "duration": 0.0, + "text": "Um some Nvidia folks have done actually" + }, + { + "start": 4390.2, + "duration": 0.0, + "text": "Um some Nvidia folks have done actually quite<01:13:10.480> nice<01:13:10.800> work<01:13:11.160> doing<01:13:11.360> systematic" + }, + { + "start": 4391.87, + "duration": 0.0, + "text": "quite nice work doing systematic" + }, + { + "start": 4391.88, + "duration": 0.0, + "text": "quite nice work doing systematic comparisons<01:13:12.520> of<01:13:12.640> these<01:13:12.760> stability" + }, + { + "start": 4393.19, + "duration": 0.0, + "text": "comparisons of these stability" + }, + { + "start": 4393.2, + "duration": 0.0, + "text": "comparisons of these stability interventions.<01:13:14.320> Um<01:13:14.960> and<01:13:15.080> what<01:13:15.200> they<01:13:15.360> find<01:13:16.120> is" + }, + { + "start": 4397.03, + "duration": 0.0, + "text": "interventions. Um and what they find is" + }, + { + "start": 4397.04, + "duration": 0.0, + "text": "interventions. 
Um and what they find is um<01:13:17.240> if<01:13:17.400> you<01:13:17.520> start<01:13:17.960> with<01:13:18.080> a<01:13:18.160> baseline<01:13:18.680> model," + }, + { + "start": 4399.43, + "duration": 0.0, + "text": "um if you start with a baseline model," + }, + { + "start": 4399.44, + "duration": 0.0, + "text": "um if you start with a baseline model, you<01:13:19.560> can<01:13:19.720> do<01:13:19.880> all<01:13:20.000> sorts<01:13:20.200> of<01:13:20.280> different" + }, + { + "start": 4400.55, + "duration": 0.0, + "text": "you can do all sorts of different" + }, + { + "start": 4400.56, + "duration": 0.0, + "text": "you can do all sorts of different interventions,<01:13:21.280> and<01:13:21.680> QK<01:13:22.080> norm<01:13:22.840> is<01:13:23.360> here,<01:13:23.720> and" + }, + { + "start": 4403.83, + "duration": 0.0, + "text": "interventions, and QK norm is here, and" + }, + { + "start": 4403.84, + "duration": 0.0, + "text": "interventions, and QK norm is here, and it<01:13:23.960> does<01:13:24.160> slightly<01:13:24.640> better<01:13:24.960> due<01:13:25.160> to<01:13:25.240> the<01:13:25.320> fact" + }, + { + "start": 4405.55, + "duration": 0.0, + "text": "it does slightly better due to the fact" + }, + { + "start": 4405.56, + "duration": 0.0, + "text": "it does slightly better due to the fact that<01:13:25.640> you<01:13:25.720> can<01:13:25.840> crank<01:13:26.240> up<01:13:26.360> the<01:13:26.440> learning<01:13:26.840> rate" + }, + { + "start": 4406.95, + "duration": 0.0, + "text": "that you can crank up the learning rate" + }, + { + "start": 4406.96, + "duration": 0.0, + "text": "that you can crank up the learning rate a<01:13:27.000> little<01:13:27.240> bit.<01:13:27.960> Um<01:13:28.760> but<01:13:28.920> if<01:13:29.040> you<01:13:29.120> do<01:13:29.280> soft" + }, + { + "start": 4409.63, + "duration": 0.0, + "text": "a little bit. Um but if you do soft" + }, + { + "start": 4409.64, + "duration": 0.0, + "text": "a little bit. 
Um but if you do soft capping<01:13:30.000> alone,<01:13:30.720> you<01:13:30.880> actually<01:13:31.200> end<01:13:31.400> up" + }, + { + "start": 4411.55, + "duration": 0.0, + "text": "capping alone, you actually end up" + }, + { + "start": 4411.56, + "duration": 0.0, + "text": "capping alone, you actually end up losing<01:13:31.920> performance.<01:13:32.440> So,<01:13:32.560> there<01:13:32.800> is<01:13:33.080> a<01:13:33.320> a" + }, + { + "start": 4413.39, + "duration": 0.0, + "text": "losing performance. So, there is a a" + }, + { + "start": 4413.4, + "duration": 0.0, + "text": "losing performance. So, there is a a quality<01:13:33.920> degradation<01:13:34.440> that<01:13:34.560> happens.<01:13:34.920> This" + }, + { + "start": 4415.03, + "duration": 0.0, + "text": "quality degradation that happens. This" + }, + { + "start": 4415.04, + "duration": 0.0, + "text": "quality degradation that happens. This is<01:13:35.120> a<01:13:35.160> very<01:13:35.360> strong<01:13:35.760> intervention.<01:13:36.240> You<01:13:36.320> can" + }, + { + "start": 4416.47, + "duration": 0.0, + "text": "is a very strong intervention. You can" + }, + { + "start": 4416.48, + "duration": 0.0, + "text": "is a very strong intervention. You can never<01:13:36.800> express<01:13:37.240> very<01:13:37.480> confident<01:13:38.560> uh<01:13:38.640> signals" + }, + { + "start": 4418.99, + "duration": 0.0, + "text": "never express very confident uh signals" + }, + { + "start": 4419.0, + "duration": 0.0, + "text": "never express very confident uh signals in<01:13:39.080> your<01:13:39.200> soft<01:13:39.520> max<01:13:39.880> beyond<01:13:40.120> a<01:13:40.160> certain<01:13:40.440> point." + }, + { + "start": 4421.39, + "duration": 0.0, + "text": "in your soft max beyond a certain point." + }, + { + "start": 4421.4, + "duration": 0.0, + "text": "in your soft max beyond a certain point. 
Um<01:13:41.680> so,<01:13:41.800> it<01:13:41.880> does<01:13:42.080> have<01:13:42.240> some<01:13:42.560> negative" + }, + { + "start": 4422.99, + "duration": 0.0, + "text": "Um so, it does have some negative" + }, + { + "start": 4423.0, + "duration": 0.0, + "text": "Um so, it does have some negative consequences,<01:13:43.760> but<01:13:43.880> this<01:13:44.000> is<01:13:44.080> a<01:13:44.120> very<01:13:44.400> safe" + }, + { + "start": 4424.79, + "duration": 0.0, + "text": "consequences, but this is a very safe" + }, + { + "start": 4424.8, + "duration": 0.0, + "text": "consequences, but this is a very safe way<01:13:45.400> of<01:13:45.560> stabilizing<01:13:46.680> the<01:13:46.840> outputs<01:13:47.240> of<01:13:47.360> your" + }, + { + "start": 4427.47, + "duration": 0.0, + "text": "way of stabilizing the outputs of your" + }, + { + "start": 4427.48, + "duration": 0.0, + "text": "way of stabilizing the outputs of your attention.<01:13:48.360> Or<01:13:48.520> sorry,<01:13:48.680> the<01:13:48.840> the<01:13:49.000> inputs<01:13:49.360> to" + }, + { + "start": 4429.39, + "duration": 0.0, + "text": "attention. Or sorry, the the inputs to" + }, + { + "start": 4429.4, + "duration": 0.0, + "text": "attention. Or sorry, the the inputs to your<01:13:49.520> attention,<01:13:49.840> the<01:13:49.960> logits<01:13:50.400> that<01:13:50.560> go<01:13:50.680> into" + }, + { + "start": 4430.83, + "duration": 0.0, + "text": "your attention, the logits that go into" + }, + { + "start": 4430.84, + "duration": 0.0, + "text": "your attention, the logits that go into the<01:13:50.920> soft<01:13:51.200> max." + }, + { + "start": 4432.55, + "duration": 0.0, + "text": "the soft max." + }, + { + "start": 4432.56, + "duration": 0.0, + "text": "the soft max. Okay." + }, + { + "start": 4433.47, + "duration": 0.0, + "text": "Okay." + }, + { + "start": 4433.48, + "duration": 0.0, + "text": "Okay. 
So,<01:13:53.960> that's<01:13:54.160> kind<01:13:54.320> of<01:13:54.400> the<01:13:54.480> end<01:13:54.640> of<01:13:54.720> the" + }, + { + "start": 4434.79, + "duration": 0.0, + "text": "So, that's kind of the end of the" + }, + { + "start": 4434.8, + "duration": 0.0, + "text": "So, that's kind of the end of the stability<01:13:55.520> components.<01:13:56.560> Um<01:13:56.720> I<01:13:56.760> can<01:13:56.920> pause<01:13:57.280> for" + }, + { + "start": 4437.39, + "duration": 0.0, + "text": "stability components. Um I can pause for" + }, + { + "start": 4437.4, + "duration": 0.0, + "text": "stability components. Um I can pause for a<01:13:57.440> moment<01:13:57.800> here.<01:13:58.680> Um<01:13:59.200> and<01:13:59.320> I'll<01:13:59.400> talk<01:13:59.680> about" + }, + { + "start": 4439.95, + "duration": 0.0, + "text": "a moment here. Um and I'll talk about" + }, + { + "start": 4439.96, + "duration": 0.0, + "text": "a moment here. Um and I'll talk about sort<01:14:00.080> of<01:14:00.240> various<01:14:00.560> attention<01:14:01.000> things<01:14:01.920> um" + }, + { + "start": 4441.99, + "duration": 0.0, + "text": "sort of various attention things um" + }, + { + "start": 4442.0, + "duration": 0.0, + "text": "sort of various attention things um after<01:14:02.280> that." + }, + { + "start": 4446.4, + "duration": 0.0, + "text": "Cool." + }, + { + "start": 4447.31, + "duration": 0.0, + "text": "Cool." + }, + { + "start": 4447.32, + "duration": 0.0, + "text": "Cool. Okay." + }, + { + "start": 4448.83, + "duration": 0.0, + "text": "Okay." + }, + { + "start": 4448.84, + "duration": 0.0, + "text": "Okay. All<01:14:09.000> right.<01:14:09.200> So," + }, + { + "start": 4450.31, + "duration": 0.0, + "text": "All right. So," + }, + { + "start": 4450.32, + "duration": 0.0, + "text": "All right. 
So, the<01:14:10.440> last<01:14:10.720> thing<01:14:10.840> I<01:14:10.880> want<01:14:11.000> to<01:14:11.080> talk<01:14:11.280> about" + }, + { + "start": 4451.51, + "duration": 0.0, + "text": "the last thing I want to talk about" + }, + { + "start": 4451.52, + "duration": 0.0, + "text": "the last thing I want to talk about today<01:14:11.960> is<01:14:12.360> various<01:14:12.760> interventions<01:14:13.280> that<01:14:13.480> you" + }, + { + "start": 4453.55, + "duration": 0.0, + "text": "today is various interventions that you" + }, + { + "start": 4453.56, + "duration": 0.0, + "text": "today is various interventions that you can<01:14:13.680> make<01:14:13.840> to<01:14:13.920> your<01:14:14.000> attention<01:14:14.480> head." + }, + { + "start": 4455.15, + "duration": 0.0, + "text": "can make to your attention head." + }, + { + "start": 4455.16, + "duration": 0.0, + "text": "can make to your attention head. Um<01:14:15.680> and<01:14:15.800> as<01:14:15.920> I<01:14:16.000> was<01:14:16.160> saying<01:14:16.360> at<01:14:16.440> the<01:14:16.560> beginning" + }, + { + "start": 4456.99, + "duration": 0.0, + "text": "Um and as I was saying at the beginning" + }, + { + "start": 4457.0, + "duration": 0.0, + "text": "Um and as I was saying at the beginning of<01:14:17.120> this<01:14:17.280> lecture," + }, + { + "start": 4458.31, + "duration": 0.0, + "text": "of this lecture," + }, + { + "start": 4458.32, + "duration": 0.0, + "text": "of this lecture, um<01:14:18.640> I'm<01:14:18.840> only<01:14:19.120> going<01:14:19.240> to<01:14:19.320> talk<01:14:19.600> about<01:14:20.040> all<01:14:20.160> the" + }, + { + "start": 4460.27, + "duration": 0.0, + "text": "um I'm only going to talk about all the" + }, + { + "start": 4460.28, + "duration": 0.0, + "text": "um I'm only going to talk about all the things<01:14:20.560> that<01:14:20.680> you<01:14:20.760> can<01:14:20.920> do<01:14:21.600> to<01:14:21.840> sort<01:14:22.000> of<01:14:22.160> dense" + }, + { + 
"start": 4462.55, + "duration": 0.0, + "text": "things that you can do to sort of dense" + }, + { + "start": 4462.56, + "duration": 0.0, + "text": "things that you can do to sort of dense all<01:14:22.760> by<01:14:23.000> all<01:14:23.160> attention<01:14:23.600> today.<01:14:23.840> So,<01:14:23.960> if<01:14:24.080> you" + }, + { + "start": 4464.27, + "duration": 0.0, + "text": "all by all attention today. So, if you" + }, + { + "start": 4464.28, + "duration": 0.0, + "text": "all by all attention today. So, if you if<01:14:24.480> you<01:14:24.560> were<01:14:24.720> interested<01:14:25.200> in<01:14:25.320> hearing<01:14:25.560> about" + }, + { + "start": 4465.79, + "duration": 0.0, + "text": "if you were interested in hearing about" + }, + { + "start": 4465.8, + "duration": 0.0, + "text": "if you were interested in hearing about state<01:14:26.040> space<01:14:26.320> models<01:14:26.880> or<01:14:27.000> linear<01:14:27.320> time" + }, + { + "start": 4467.51, + "duration": 0.0, + "text": "state space models or linear time" + }, + { + "start": 4467.52, + "duration": 0.0, + "text": "state space models or linear time attention,<01:14:28.480> um<01:14:28.680> sadly<01:14:29.080> today<01:14:29.280> is<01:14:29.400> not<01:14:29.560> the<01:14:29.680> day" + }, + { + "start": 4469.83, + "duration": 0.0, + "text": "attention, um sadly today is not the day" + }, + { + "start": 4469.84, + "duration": 0.0, + "text": "attention, um sadly today is not the day for<01:14:30.000> you." + }, + { + "start": 4470.83, + "duration": 0.0, + "text": "for you." + }, + { + "start": 4470.84, + "duration": 0.0, + "text": "for you. 
Um<01:14:31.640> the<01:14:31.760> things<01:14:32.000> that<01:14:32.120> I<01:14:32.200> do<01:14:32.400> want<01:14:32.520> to<01:14:32.600> talk" + }, + { + "start": 4472.83, + "duration": 0.0, + "text": "Um the things that I do want to talk" + }, + { + "start": 4472.84, + "duration": 0.0, + "text": "Um the things that I do want to talk about,<01:14:33.320> which<01:14:33.520> are<01:14:33.640> really<01:14:33.960> commonly" + }, + { + "start": 4474.55, + "duration": 0.0, + "text": "about, which are really commonly" + }, + { + "start": 4474.56, + "duration": 0.0, + "text": "about, which are really commonly implemented<01:14:35.800> um<01:14:36.160> attention<01:14:36.640> interventions" + }, + { + "start": 4477.19, + "duration": 0.0, + "text": "implemented um attention interventions" + }, + { + "start": 4477.2, + "duration": 0.0, + "text": "implemented um attention interventions today,<01:14:38.000> are<01:14:38.600> uh<01:14:38.640> group<01:14:39.320> query<01:14:39.520> attention," + }, + { + "start": 4480.07, + "duration": 0.0, + "text": "today, are uh group query attention," + }, + { + "start": 4480.08, + "duration": 0.0, + "text": "today, are uh group query attention, which<01:14:40.320> really<01:14:40.520> saves<01:14:40.880> inference<01:14:41.280> cost<01:14:41.720> by" + }, + { + "start": 4481.83, + "duration": 0.0, + "text": "which really saves inference cost by" + }, + { + "start": 4481.84, + "duration": 0.0, + "text": "which really saves inference cost by reducing<01:14:42.360> the<01:14:42.440> number<01:14:42.720> of<01:14:42.840> heads,<01:14:43.640> um<01:14:44.400> and" + }, + { + "start": 4484.63, + "duration": 0.0, + "text": "reducing the number of heads, um and" + }, + { + "start": 4484.64, + "duration": 0.0, + "text": "reducing the number of heads, um and sparse<01:14:45.080> or<01:14:45.160> sliding<01:14:45.560> window<01:14:45.840> attention,<01:14:46.760> um" + }, + { + "start": 4486.79, + "duration": 0.0, + "text": "sparse or sliding 
window attention, um" + }, + { + "start": 4486.8, + "duration": 0.0, + "text": "sparse or sliding window attention, um which<01:14:47.200> really<01:14:47.640> originally<01:14:48.160> came<01:14:48.360> from<01:14:48.480> the" + }, + { + "start": 4488.59, + "duration": 0.0, + "text": "which really originally came from the" + }, + { + "start": 4488.6, + "duration": 0.0, + "text": "which really originally came from the GPT-3-ish" + }, + { + "start": 4490.15, + "duration": 0.0, + "text": "GPT-3-ish" + }, + { + "start": 4490.16, + "duration": 0.0, + "text": "GPT-3-ish family,<01:14:50.920> but<01:14:51.040> have<01:14:51.240> now<01:14:51.520> really<01:14:51.800> been<01:14:51.960> adopted" + }, + { + "start": 4492.51, + "duration": 0.0, + "text": "family, but have now really been adopted" + }, + { + "start": 4492.52, + "duration": 0.0, + "text": "family, but have now really been adopted widely<01:14:53.400> by<01:14:53.920> most<01:14:54.480> models<01:14:55.360> that<01:14:55.520> are<01:14:55.640> looking" + }, + { + "start": 4495.91, + "duration": 0.0, + "text": "widely by most models that are looking" + }, + { + "start": 4495.92, + "duration": 0.0, + "text": "widely by most models that are looking to<01:14:56.040> do<01:14:56.400> uh<01:14:56.480> long<01:14:56.760> context<01:14:57.240> unless<01:14:57.480> they're" + }, + { + "start": 4497.59, + "duration": 0.0, + "text": "to do uh long context unless they're" + }, + { + "start": 4497.6, + "duration": 0.0, + "text": "to do uh long context unless they're doing<01:14:57.840> exotic<01:14:58.760> uh<01:14:58.840> SSM<01:14:59.280> stuff." 
+ }, + { + "start": 4501.72, + "duration": 0.0, + "text": "So,<01:15:01.880> I'll<01:15:02.000> start<01:15:02.360> with<01:15:02.720> um<01:15:03.080> group<01:15:03.400> query" + }, + { + "start": 4503.59, + "duration": 0.0, + "text": "So, I'll start with um group query" + }, + { + "start": 4503.6, + "duration": 0.0, + "text": "So, I'll start with um group query attention<01:15:03.920> or<01:15:04.000> GQA<01:15:04.480> or<01:15:04.600> MQA.<01:15:05.720> Um" + }, + { + "start": 4506.39, + "duration": 0.0, + "text": "attention or GQA or MQA. Um" + }, + { + "start": 4506.4, + "duration": 0.0, + "text": "attention or GQA or MQA. Um this<01:15:06.880> I'm<01:15:06.960> going<01:15:07.080> to<01:15:07.640> first<01:15:07.960> set<01:15:08.120> up<01:15:08.280> the<01:15:08.400> need" + }, + { + "start": 4508.99, + "duration": 0.0, + "text": "this I'm going to first set up the need" + }, + { + "start": 4509.0, + "duration": 0.0, + "text": "this I'm going to first set up the need for<01:15:09.160> these<01:15:09.400> kinds<01:15:09.640> of<01:15:09.720> things,<01:15:10.360> and<01:15:10.520> then" + }, + { + "start": 4510.79, + "duration": 0.0, + "text": "for these kinds of things, and then" + }, + { + "start": 4510.8, + "duration": 0.0, + "text": "for these kinds of things, and then you'll<01:15:10.920> kind<01:15:11.080> of<01:15:11.400> hopefully<01:15:11.840> see<01:15:12.120> what<01:15:12.280> the" + }, + { + "start": 4512.47, + "duration": 0.0, + "text": "you'll kind of hopefully see what the" + }, + { + "start": 4512.48, + "duration": 0.0, + "text": "you'll kind of hopefully see what the what<01:15:12.600> the<01:15:12.680> trick<01:15:13.000> is<01:15:13.840> um<01:15:13.960> and<01:15:14.160> why<01:15:14.280> it's<01:15:14.480> fairly" + }, + { + "start": 4514.79, + "duration": 0.0, + "text": "what the trick is um and why it's fairly" + }, + { + "start": 4514.8, + "duration": 0.0, + "text": "what the trick is um and why it's fairly natural." 
+ }, + { + "start": 4516.15, + "duration": 0.0, + "text": "natural." + }, + { + "start": 4516.16, + "duration": 0.0, + "text": "natural. So,<01:15:16.960> for<01:15:17.080> the<01:15:17.160> moment,<01:15:17.480> we've<01:15:17.600> been<01:15:17.760> talking" + }, + { + "start": 4518.07, + "duration": 0.0, + "text": "So, for the moment, we've been talking" + }, + { + "start": 4518.08, + "duration": 0.0, + "text": "So, for the moment, we've been talking about,<01:15:18.560> you<01:15:18.640> know,<01:15:18.720> training<01:15:19.280> and<01:15:19.400> modeling" + }, + { + "start": 4519.83, + "duration": 0.0, + "text": "about, you know, training and modeling" + }, + { + "start": 4519.84, + "duration": 0.0, + "text": "about, you know, training and modeling and<01:15:19.920> all<01:15:20.040> these<01:15:20.200> things,<01:15:20.560> but<01:15:20.680> like<01:15:20.840> let's" + }, + { + "start": 4520.99, + "duration": 0.0, + "text": "and all these things, but like let's" + }, + { + "start": 4521.0, + "duration": 0.0, + "text": "and all these things, but like let's take<01:15:21.160> a<01:15:21.240> pause,<01:15:22.160> and<01:15:22.240> now<01:15:22.400> let's<01:15:22.560> think<01:15:22.680> about" + }, + { + "start": 4523.03, + "duration": 0.0, + "text": "take a pause, and now let's think about" + }, + { + "start": 4523.04, + "duration": 0.0, + "text": "take a pause, and now let's think about deployment,<01:15:23.920> right?<01:15:24.520> You<01:15:24.920> train<01:15:25.200> this<01:15:25.320> very" + }, + { + "start": 4525.51, + "duration": 0.0, + "text": "deployment, right? You train this very" + }, + { + "start": 4525.52, + "duration": 0.0, + "text": "deployment, right? 
You train this very big<01:15:25.680> model,<01:15:26.480> and<01:15:26.640> now<01:15:26.840> you<01:15:26.920> need<01:15:27.040> to<01:15:27.160> serve<01:15:27.440> it" + }, + { + "start": 4527.55, + "duration": 0.0, + "text": "big model, and now you need to serve it" + }, + { + "start": 4527.56, + "duration": 0.0, + "text": "big model, and now you need to serve it to<01:15:27.640> lots<01:15:27.880> of<01:15:28.000> users,<01:15:28.880> and<01:15:29.000> you're<01:15:29.080> going<01:15:29.200> to" + }, + { + "start": 4529.27, + "duration": 0.0, + "text": "to lots of users, and you're going to" + }, + { + "start": 4529.28, + "duration": 0.0, + "text": "to lots of users, and you're going to pay<01:15:29.480> a<01:15:29.560> cost<01:15:30.120> for<01:15:30.280> serving." + }, + { + "start": 4531.43, + "duration": 0.0, + "text": "pay a cost for serving." + }, + { + "start": 4531.44, + "duration": 0.0, + "text": "pay a cost for serving. And<01:15:32.040> you're<01:15:32.240> going<01:15:32.360> to<01:15:32.440> have<01:15:32.600> to,<01:15:33.000> in<01:15:33.320> abstract" + }, + { + "start": 4533.87, + "duration": 0.0, + "text": "And you're going to have to, in abstract" + }, + { + "start": 4533.88, + "duration": 0.0, + "text": "And you're going to have to, in abstract sense,<01:15:34.200> pay<01:15:34.360> for<01:15:34.520> two<01:15:34.760> different<01:15:35.120> resources," + }, + { + "start": 4535.91, + "duration": 0.0, + "text": "sense, pay for two different resources," + }, + { + "start": 4535.92, + "duration": 0.0, + "text": "sense, pay for two different resources, right?<01:15:36.040> You're<01:15:36.120> going<01:15:36.240> to<01:15:36.280> have<01:15:36.400> to<01:15:36.520> pay<01:15:36.680> for" + }, + { + "start": 4536.79, + "duration": 0.0, + "text": "right? You're going to have to pay for" + }, + { + "start": 4536.8, + "duration": 0.0, + "text": "right? 
You're going to have to pay for your<01:15:36.920> flops,<01:15:37.720> right?<01:15:38.000> The<01:15:38.520> computation<01:15:39.120> that" + }, + { + "start": 4539.23, + "duration": 0.0, + "text": "your flops, right? The computation that" + }, + { + "start": 4539.24, + "duration": 0.0, + "text": "your flops, right? The computation that you're<01:15:39.320> performing,<01:15:40.320> but<01:15:40.440> you<01:15:40.520> also<01:15:40.680> have<01:15:40.800> to" + }, + { + "start": 4540.87, + "duration": 0.0, + "text": "you're performing, but you also have to" + }, + { + "start": 4540.88, + "duration": 0.0, + "text": "you're performing, but you also have to pay<01:15:41.040> for<01:15:41.160> another<01:15:41.560> thing.<01:15:41.800> You<01:15:41.840> have<01:15:41.960> to<01:15:42.040> pay" + }, + { + "start": 4542.19, + "duration": 0.0, + "text": "pay for another thing. You have to pay" + }, + { + "start": 4542.2, + "duration": 0.0, + "text": "pay for another thing. You have to pay for<01:15:42.320> your<01:15:42.440> memory<01:15:42.880> accesses,<01:15:43.760> right?<01:15:44.440> Because" + }, + { + "start": 4544.71, + "duration": 0.0, + "text": "for your memory accesses, right? Because" + }, + { + "start": 4544.72, + "duration": 0.0, + "text": "for your memory accesses, right? 
Because the<01:15:44.800> memory<01:15:45.160> accesses<01:15:45.640> are<01:15:45.760> also<01:15:46.120> going<01:15:46.320> to" + }, + { + "start": 4546.43, + "duration": 0.0, + "text": "the memory accesses are also going to" + }, + { + "start": 4546.44, + "duration": 0.0, + "text": "the memory accesses are also going to impact,<01:15:47.040> you<01:15:47.120> know,<01:15:47.240> your<01:15:47.440> system's" + }, + { + "start": 4547.83, + "duration": 0.0, + "text": "impact, you know, your system's" + }, + { + "start": 4547.84, + "duration": 0.0, + "text": "impact, you know, your system's characteristics,<01:15:48.520> your<01:15:48.640> latency,<01:15:49.280> your" + }, + { + "start": 4549.39, + "duration": 0.0, + "text": "characteristics, your latency, your" + }, + { + "start": 4549.4, + "duration": 0.0, + "text": "characteristics, your latency, your utilization,<01:15:50.120> right?<01:15:50.600> So,<01:15:50.720> you<01:15:50.800> want<01:15:51.040> both<01:15:51.360> of" + }, + { + "start": 4551.47, + "duration": 0.0, + "text": "utilization, right? So, you want both of" + }, + { + "start": 4551.48, + "duration": 0.0, + "text": "utilization, right? So, you want both of these<01:15:51.680> things<01:15:52.000> to<01:15:52.080> be<01:15:52.200> small." + }, + { + "start": 4553.51, + "duration": 0.0, + "text": "these things to be small." + }, + { + "start": 4553.52, + "duration": 0.0, + "text": "these things to be small. 
Now,<01:15:54.280> let's<01:15:54.520> think<01:15:54.680> about<01:15:54.880> what<01:15:55.000> happens" + }, + { + "start": 4555.39, + "duration": 0.0, + "text": "Now, let's think about what happens" + }, + { + "start": 4555.4, + "duration": 0.0, + "text": "Now, let's think about what happens during<01:15:55.600> training<01:15:56.240> or<01:15:56.840> alternatively<01:15:57.640> prefill" + }, + { + "start": 4558.27, + "duration": 0.0, + "text": "during training or alternatively prefill" + }, + { + "start": 4558.28, + "duration": 0.0, + "text": "during training or alternatively prefill when<01:15:58.440> you're<01:15:58.640> looking<01:15:58.880> at<01:15:58.960> your<01:15:59.080> prompt<01:15:59.520> where" + }, + { + "start": 4559.67, + "duration": 0.0, + "text": "when you're looking at your prompt where" + }, + { + "start": 4559.68, + "duration": 0.0, + "text": "when you're looking at your prompt where someone<01:15:59.960> gives<01:16:00.200> you<01:16:00.280> the<01:16:00.400> stuff.<01:16:01.280> In<01:16:01.440> this" + }, + { + "start": 4561.63, + "duration": 0.0, + "text": "someone gives you the stuff. In this" + }, + { + "start": 4561.64, + "duration": 0.0, + "text": "someone gives you the stuff. 
In this case,<01:16:02.360> you<01:16:02.440> know,<01:16:02.680> the<01:16:02.840> total<01:16:03.200> arithmetic" + }, + { + "start": 4563.71, + "duration": 0.0, + "text": "case, you know, the total arithmetic" + }, + { + "start": 4563.72, + "duration": 0.0, + "text": "case, you know, the total arithmetic operations<01:16:04.240> you<01:16:04.400> have<01:16:05.120> is,<01:16:05.600> you<01:16:05.680> know,<01:16:06.120> order" + }, + { + "start": 4566.31, + "duration": 0.0, + "text": "operations you have is, you know, order" + }, + { + "start": 4566.32, + "duration": 0.0, + "text": "operations you have is, you know, order of<01:16:06.400> magnitude<01:16:07.200> batch<01:16:07.600> size<01:16:08.080> sequence<01:16:08.600> length" + }, + { + "start": 4569.11, + "duration": 0.0, + "text": "of magnitude batch size sequence length" + }, + { + "start": 4569.12, + "duration": 0.0, + "text": "of magnitude batch size sequence length hidden<01:16:09.440> dim<01:16:09.640> squared,<01:16:10.120> right?<01:16:10.280> That's" + }, + { + "start": 4570.51, + "duration": 0.0, + "text": "hidden dim squared, right? That's" + }, + { + "start": 4570.52, + "duration": 0.0, + "text": "hidden dim squared, right? That's roughly<01:16:11.000> the<01:16:11.120> size<01:16:11.440> of<01:16:11.560> things<01:16:11.840> that<01:16:12.000> you<01:16:12.120> get." + }, + { + "start": 4572.59, + "duration": 0.0, + "text": "roughly the size of things that you get." + }, + { + "start": 4572.6, + "duration": 0.0, + "text": "roughly the size of things that you get. 
And<01:16:12.680> of<01:16:12.760> course,<01:16:13.120> you<01:16:13.240> know,<01:16:13.320> we're<01:16:13.440> doing" + }, + { + "start": 4573.59, + "duration": 0.0, + "text": "And of course, you know, we're doing" + }, + { + "start": 4573.6, + "duration": 0.0, + "text": "And of course, you know, we're doing quadratic<01:16:14.000> attention,<01:16:14.360> so<01:16:14.440> we've<01:16:14.600> got<01:16:14.800> D" + }, + { + "start": 4574.95, + "duration": 0.0, + "text": "quadratic attention, so we've got D" + }, + { + "start": 4574.96, + "duration": 0.0, + "text": "quadratic attention, so we've got D squared." + }, + { + "start": 4576.39, + "duration": 0.0, + "text": "squared." + }, + { + "start": 4576.4, + "duration": 0.0, + "text": "squared. Um" + }, + { + "start": 4577.59, + "duration": 0.0, + "text": "Um" + }, + { + "start": 4577.6, + "duration": 0.0, + "text": "Um we've<01:16:17.800> got<01:16:18.240> uh" + }, + { + "start": 4578.83, + "duration": 0.0, + "text": "we've got uh" + }, + { + "start": 4578.84, + "duration": 0.0, + "text": "we've got uh total<01:16:19.200> memory<01:16:19.600> accesses.<01:16:20.160> Like<01:16:20.320> what<01:16:20.440> is<01:16:20.520> our" + }, + { + "start": 4580.63, + "duration": 0.0, + "text": "total memory accesses. Like what is our" + }, + { + "start": 4580.64, + "duration": 0.0, + "text": "total memory accesses. Like what is our memory<01:16:20.960> access<01:16:21.320> that<01:16:21.520> we<01:16:21.640> have<01:16:21.880> here?<01:16:22.600> Um<01:16:22.680> we" + }, + { + "start": 4582.75, + "duration": 0.0, + "text": "memory access that we have here? Um we" + }, + { + "start": 4582.76, + "duration": 0.0, + "text": "memory access that we have here? 
Um we have<01:16:23.000> batch<01:16:23.400> times<01:16:23.600> sequence<01:16:23.920> length<01:16:24.080> times" + }, + { + "start": 4584.71, + "duration": 0.0, + "text": "have batch times sequence length times" + }, + { + "start": 4584.72, + "duration": 0.0, + "text": "have batch times sequence length times uh<01:16:24.800> hidden<01:16:25.080> dim<01:16:25.640> plus<01:16:26.560> um" + }, + { + "start": 4587.11, + "duration": 0.0, + "text": "uh hidden dim plus um" + }, + { + "start": 4587.12, + "duration": 0.0, + "text": "uh hidden dim plus um the<01:16:27.240> sort<01:16:27.400> of<01:16:27.520> cost<01:16:27.880> of<01:16:27.960> the<01:16:28.040> soft<01:16:28.400> max,<01:16:28.680> which" + }, + { + "start": 4588.79, + "duration": 0.0, + "text": "the sort of cost of the soft max, which" + }, + { + "start": 4588.8, + "duration": 0.0, + "text": "the sort of cost of the soft max, which has<01:16:28.920> a<01:16:29.000> N<01:16:29.160> squared<01:16:29.440> component,<01:16:30.160> and<01:16:30.240> then" + }, + { + "start": 4590.31, + "duration": 0.0, + "text": "has a N squared component, and then" + }, + { + "start": 4590.32, + "duration": 0.0, + "text": "has a N squared component, and then we've<01:16:30.440> got<01:16:30.600> a<01:16:30.680> D<01:16:30.960> squared<01:16:31.240> component<01:16:32.280> um<01:16:32.640> for" + }, + { + "start": 4592.79, + "duration": 0.0, + "text": "we've got a D squared component um for" + }, + { + "start": 4592.8, + "duration": 0.0, + "text": "we've got a D squared component um for the<01:16:33.160> for<01:16:33.280> the<01:16:33.360> projections.<01:16:34.400> So,<01:16:34.480> the" + }, + { + "start": 4594.55, + "duration": 0.0, + "text": "the for the projections. So, the" + }, + { + "start": 4594.56, + "duration": 0.0, + "text": "the for the projections. 
So, the arithmetic<01:16:34.960> intensity<01:16:35.360> here<01:16:35.520> is<01:16:35.640> pretty" + }, + { + "start": 4595.99, + "duration": 0.0, + "text": "arithmetic intensity here is pretty" + }, + { + "start": 4596.0, + "duration": 0.0, + "text": "arithmetic intensity here is pretty good.<01:16:36.920> Um" + }, + { + "start": 4597.47, + "duration": 0.0, + "text": "good. Um" + }, + { + "start": 4597.48, + "duration": 0.0, + "text": "good. Um it's<01:16:37.600> going<01:16:37.720> to<01:16:37.800> be<01:16:37.920> one<01:16:38.160> over<01:16:38.400> K.<01:16:38.680> This<01:16:38.840> is<01:16:38.920> the" + }, + { + "start": 4598.99, + "duration": 0.0, + "text": "it's going to be one over K. This is the" + }, + { + "start": 4599.0, + "duration": 0.0, + "text": "it's going to be one over K. This is the number<01:16:39.240> of<01:16:39.360> heads,<01:16:39.640> so<01:16:39.760> you<01:16:39.800> need<01:16:40.000> to<01:16:40.080> have<01:16:40.880> um" + }, + { + "start": 4601.27, + "duration": 0.0, + "text": "number of heads, so you need to have um" + }, + { + "start": 4601.28, + "duration": 0.0, + "text": "number of heads, so you need to have um uh<01:16:41.320> sorry,<01:16:41.600> head<01:16:41.800> dims.<01:16:42.120> So,<01:16:42.240> your<01:16:42.360> head<01:16:42.600> dims" + }, + { + "start": 4602.83, + "duration": 0.0, + "text": "uh sorry, head dims. So, your head dims" + }, + { + "start": 4602.84, + "duration": 0.0, + "text": "uh sorry, head dims. 
So, your head dims need<01:16:43.000> to<01:16:43.040> be<01:16:43.160> big<01:16:43.320> enough<01:16:43.520> that<01:16:43.640> you're" + }, + { + "start": 4603.75, + "duration": 0.0, + "text": "need to be big enough that you're" + }, + { + "start": 4603.76, + "duration": 0.0, + "text": "need to be big enough that you're multiplying<01:16:44.280> some<01:16:44.440> reasonably<01:16:44.880> sized" + }, + { + "start": 4605.11, + "duration": 0.0, + "text": "multiplying some reasonably sized" + }, + { + "start": 4605.12, + "duration": 0.0, + "text": "multiplying some reasonably sized matrices.<01:16:46.000> And<01:16:46.080> you've<01:16:46.200> got<01:16:46.320> a<01:16:46.400> one<01:16:46.680> over<01:16:47.000> BN," + }, + { + "start": 4607.87, + "duration": 0.0, + "text": "matrices. And you've got a one over BN," + }, + { + "start": 4607.88, + "duration": 0.0, + "text": "matrices. And you've got a one over BN, so<01:16:47.960> your<01:16:48.080> sequences<01:16:48.680> need<01:16:48.840> to<01:16:48.880> be<01:16:49.000> long<01:16:49.240> enough" + }, + { + "start": 4609.51, + "duration": 0.0, + "text": "so your sequences need to be long enough" + }, + { + "start": 4609.52, + "duration": 0.0, + "text": "so your sequences need to be long enough or<01:16:49.760> your<01:16:49.880> batch<01:16:50.160> sizes<01:16:50.520> need<01:16:50.680> to<01:16:50.760> be<01:16:50.840> big" + }, + { + "start": 4611.03, + "duration": 0.0, + "text": "or your batch sizes need to be big" + }, + { + "start": 4611.04, + "duration": 0.0, + "text": "or your batch sizes need to be big enough.<01:16:51.800> As<01:16:51.960> long<01:16:52.160> as<01:16:52.320> both<01:16:52.600> of<01:16:52.680> these<01:16:52.840> are" + }, + { + "start": 4612.95, + "duration": 0.0, + "text": "enough. As long as both of these are" + }, + { + "start": 4612.96, + "duration": 0.0, + "text": "enough. 
As long as both of these are true,<01:16:53.640> your<01:16:53.800> GPUs<01:16:54.240> are<01:16:54.320> going<01:16:54.440> to<01:16:54.520> be<01:16:54.640> fully" + }, + { + "start": 4614.91, + "duration": 0.0, + "text": "true, your GPUs are going to be fully" + }, + { + "start": 4614.92, + "duration": 0.0, + "text": "true, your GPUs are going to be fully utilized.<01:16:55.480> Great,<01:16:55.840> right?<01:16:56.000> You're<01:16:56.440> You're" + }, + { + "start": 4616.51, + "duration": 0.0, + "text": "utilized. Great, right? You're You're" + }, + { + "start": 4616.52, + "duration": 0.0, + "text": "utilized. Great, right? You're You're using<01:16:56.800> all<01:16:56.920> of<01:16:57.000> your<01:16:57.120> resources." + }, + { + "start": 4619.47, + "duration": 0.0, + "text": "using all of your resources." + }, + { + "start": 4619.48, + "duration": 0.0, + "text": "using all of your resources. Now,<01:17:00.200> you<01:17:00.320> know,<01:17:00.880> we<01:17:01.000> have<01:17:01.160> done<01:17:01.560> we<01:17:01.640> have" + }, + { + "start": 4621.75, + "duration": 0.0, + "text": "Now, you know, we have done we have" + }, + { + "start": 4621.76, + "duration": 0.0, + "text": "Now, you know, we have done we have finished<01:17:02.000> training,<01:17:02.480> and<01:17:02.560> now<01:17:02.680> we're<01:17:02.800> serving" + }, + { + "start": 4623.15, + "duration": 0.0, + "text": "finished training, and now we're serving" + }, + { + "start": 4623.16, + "duration": 0.0, + "text": "finished training, and now we're serving our<01:17:03.280> users.<01:17:03.920> How<01:17:04.040> do<01:17:04.120> we<01:17:04.240> serve<01:17:04.480> our<01:17:04.600> users?" + }, + { + "start": 4624.95, + "duration": 0.0, + "text": "our users. How do we serve our users?" + }, + { + "start": 4624.96, + "duration": 0.0, + "text": "our users. How do we serve our users? 
We're<01:17:05.080> going<01:17:05.200> to<01:17:05.240> generate<01:17:05.680> tokens<01:17:06.080> and<01:17:06.200> send" + }, + { + "start": 4626.43, + "duration": 0.0, + "text": "We're going to generate tokens and send" + }, + { + "start": 4626.44, + "duration": 0.0, + "text": "We're going to generate tokens and send it<01:17:06.560> to<01:17:06.640> them,<01:17:06.880> right?<01:17:07.640> Now,<01:17:07.880> for<01:17:08.040> doing<01:17:08.440> that," + }, + { + "start": 4629.15, + "duration": 0.0, + "text": "it to them, right? Now, for doing that," + }, + { + "start": 4629.16, + "duration": 0.0, + "text": "it to them, right? Now, for doing that, um<01:17:09.320> I<01:17:09.400> can't<01:17:10.000> parallelize<01:17:10.560> the<01:17:10.680> generation" + }, + { + "start": 4631.15, + "duration": 0.0, + "text": "um I can't parallelize the generation" + }, + { + "start": 4631.16, + "duration": 0.0, + "text": "um I can't parallelize the generation process.<01:17:11.920> What<01:17:12.040> I'm<01:17:12.120> going<01:17:12.240> to<01:17:12.280> do<01:17:12.400> is<01:17:12.480> I'm" + }, + { + "start": 4632.55, + "duration": 0.0, + "text": "process. What I'm going to do is I'm" + }, + { + "start": 4632.56, + "duration": 0.0, + "text": "process. 
What I'm going to do is I'm going<01:17:12.680> to<01:17:12.720> generate<01:17:13.120> a<01:17:13.160> token,<01:17:13.640> I'm<01:17:13.680> going<01:17:13.800> to" + }, + { + "start": 4633.87, + "duration": 0.0, + "text": "going to generate a token, I'm going to" + }, + { + "start": 4633.88, + "duration": 0.0, + "text": "going to generate a token, I'm going to condition<01:17:14.320> on<01:17:14.440> it,<01:17:14.520> I'm<01:17:14.560> going<01:17:14.680> to<01:17:14.760> generate" + }, + { + "start": 4635.03, + "duration": 0.0, + "text": "condition on it, I'm going to generate" + }, + { + "start": 4635.04, + "duration": 0.0, + "text": "condition on it, I'm going to generate the<01:17:15.120> next<01:17:15.400> token,<01:17:16.000> and<01:17:16.080> I'm<01:17:16.120> going<01:17:16.240> to<01:17:16.320> repeat" + }, + { + "start": 4636.63, + "duration": 0.0, + "text": "the next token, and I'm going to repeat" + }, + { + "start": 4636.64, + "duration": 0.0, + "text": "the next token, and I'm going to repeat this<01:17:16.800> process<01:17:17.200> one<01:17:17.360> by<01:17:17.520> one,<01:17:17.720> right?<01:17:17.840> This<01:17:17.960> is" + }, + { + "start": 4638.07, + "duration": 0.0, + "text": "this process one by one, right? This is" + }, + { + "start": 4638.08, + "duration": 0.0, + "text": "this process one by one, right? This is just<01:17:18.280> sort<01:17:18.400> of<01:17:18.480> the<01:17:18.880> the<01:17:19.000> curse<01:17:19.440> of" + }, + { + "start": 4639.55, + "duration": 0.0, + "text": "just sort of the the curse of" + }, + { + "start": 4639.56, + "duration": 0.0, + "text": "just sort of the the curse of autoregressive<01:17:20.240> language<01:17:20.520> modeling.<01:17:20.880> We" + }, + { + "start": 4640.99, + "duration": 0.0, + "text": "autoregressive language modeling. We" + }, + { + "start": 4641.0, + "duration": 0.0, + "text": "autoregressive language modeling. We have<01:17:21.240> to<01:17:21.360> do<01:17:21.520> this." 
+ }, + { + "start": 4642.43, + "duration": 0.0, + "text": "have to do this." + }, + { + "start": 4642.44, + "duration": 0.0, + "text": "have to do this. Um<01:17:23.280> in<01:17:23.440> order<01:17:23.640> to<01:17:23.760> do<01:17:23.920> this,<01:17:24.280> the<01:17:24.440> efficient" + }, + { + "start": 4644.87, + "duration": 0.0, + "text": "Um in order to do this, the efficient" + }, + { + "start": 4644.88, + "duration": 0.0, + "text": "Um in order to do this, the efficient way<01:17:24.960> to<01:17:25.120> do<01:17:25.320> it<01:17:25.560> is<01:17:25.760> to<01:17:25.960> maintain<01:17:27.200> all<01:17:27.480> of<01:17:27.560> the" + }, + { + "start": 4647.75, + "duration": 0.0, + "text": "way to do it is to maintain all of the" + }, + { + "start": 4647.76, + "duration": 0.0, + "text": "way to do it is to maintain all of the sort<01:17:27.920> of<01:17:28.080> past<01:17:28.720> keys<01:17:29.040> and<01:17:29.160> queries<01:17:29.520> that<01:17:29.640> I've" + }, + { + "start": 4649.79, + "duration": 0.0, + "text": "sort of past keys and queries that I've" + }, + { + "start": 4649.8, + "duration": 0.0, + "text": "sort of past keys and queries that I've had<01:17:30.560> in<01:17:30.720> what's<01:17:30.880> called<01:17:31.080> a<01:17:31.120> KV" + }, + { + "start": 4656.88, + "duration": 0.0, + "text": "over<01:17:37.120> the<01:17:37.240> past,<01:17:38.000> and<01:17:38.160> then<01:17:38.400> whenever<01:17:38.720> I<01:17:38.760> need" + }, + { + "start": 4658.91, + "duration": 0.0, + "text": "over the past, and then whenever I need" + }, + { + "start": 4658.92, + "duration": 0.0, + "text": "over the past, and then whenever I need to<01:17:39.000> compute<01:17:39.320> something<01:17:39.640> new,<01:17:40.240> I<01:17:40.320> can<01:17:40.600> reuse" + }, + { + "start": 4661.15, + "duration": 0.0, + "text": "to compute something new, I can reuse" + }, + { + "start": 4661.16, + "duration": 0.0, + "text": "to compute something new, I can reuse 
sort<01:17:41.320> of<01:17:41.400> the<01:17:41.760> the<01:17:41.880> submatrices<01:17:43.160> that<01:17:43.320> I've" + }, + { + "start": 4663.47, + "duration": 0.0, + "text": "sort of the the submatrices that I've" + }, + { + "start": 4663.48, + "duration": 0.0, + "text": "sort of the the submatrices that I've already<01:17:43.800> had<01:17:44.040> from<01:17:44.160> the<01:17:44.280> past.<01:17:45.000> And<01:17:45.120> I<01:17:45.240> only" + }, + { + "start": 4665.59, + "duration": 0.0, + "text": "already had from the past. And I only" + }, + { + "start": 4665.6, + "duration": 0.0, + "text": "already had from the past. And I only really<01:17:45.840> need<01:17:46.000> to<01:17:46.080> compute<01:17:46.880> sort<01:17:47.040> of<01:17:47.120> the<01:17:47.240> new" + }, + { + "start": 4668.39, + "duration": 0.0, + "text": "really need to compute sort of the new" + }, + { + "start": 4668.4, + "duration": 0.0, + "text": "really need to compute sort of the new um<01:17:48.760> query<01:17:49.240> key<01:17:49.880> interactions<01:17:50.880> that<01:17:51.000> I<01:17:51.080> need<01:17:51.280> to" + }, + { + "start": 4671.39, + "duration": 0.0, + "text": "um query key interactions that I need to" + }, + { + "start": 4671.4, + "duration": 0.0, + "text": "um query key interactions that I need to fill<01:17:51.720> out<01:17:52.240> the<01:17:52.320> rest<01:17:52.960> of<01:17:53.080> this<01:17:53.280> matrix,<01:17:53.920> right?" + }, + { + "start": 4674.47, + "duration": 0.0, + "text": "fill out the rest of this matrix, right?" + }, + { + "start": 4674.48, + "duration": 0.0, + "text": "fill out the rest of this matrix, right? 
So,<01:17:54.920> every<01:17:55.160> submatrix<01:17:55.800> I've<01:17:55.920> computed" + }, + { + "start": 4676.31, + "duration": 0.0, + "text": "So, every submatrix I've computed" + }, + { + "start": 4676.32, + "duration": 0.0, + "text": "So, every submatrix I've computed before,<01:17:56.640> I<01:17:56.720> can<01:17:56.880> keep.<01:17:57.480> I<01:17:57.600> only<01:17:57.960> need<01:17:58.160> to" + }, + { + "start": 4678.23, + "duration": 0.0, + "text": "before, I can keep. I only need to" + }, + { + "start": 4678.24, + "duration": 0.0, + "text": "before, I can keep. I only need to compute<01:17:58.600> my<01:17:58.719> new<01:17:58.920> ones.<01:17:59.080> So,<01:17:59.160> this<01:17:59.360> saves<01:17:59.640> a" + }, + { + "start": 4679.709, + "duration": 0.0, + "text": "compute my new ones. So, this saves a" + }, + { + "start": 4679.719, + "duration": 0.0, + "text": "compute my new ones. So, this saves a lot<01:17:59.960> on<01:18:00.080> compute,<01:18:00.480> right?<01:18:00.600> That's<01:18:00.800> great." + }, + { + "start": 4682.43, + "duration": 0.0, + "text": "lot on compute, right? That's great." + }, + { + "start": 4682.44, + "duration": 0.0, + "text": "lot on compute, right? That's great. But," + }, + { + "start": 4683.83, + "duration": 0.0, + "text": "But," + }, + { + "start": 4683.84, + "duration": 0.0, + "text": "But, the<01:18:04.040> issue<01:18:04.320> here<01:18:04.960> is<01:18:05.320> now<01:18:05.640> our<01:18:05.760> arithmetic" + }, + { + "start": 4686.19, + "duration": 0.0, + "text": "the issue here is now our arithmetic" + }, + { + "start": 4686.2, + "duration": 0.0, + "text": "the issue here is now our arithmetic intensity<01:18:06.680> is<01:18:06.800> not<01:18:07.240> so<01:18:07.480> good,<01:18:07.800> right?<01:18:08.480> As<01:18:08.680> you" + }, + { + "start": 4688.75, + "duration": 0.0, + "text": "intensity is not so good, right? 
As you" + }, + { + "start": 4688.76, + "duration": 0.0, + "text": "intensity is not so good, right? As you might<01:18:08.920> sort<01:18:09.080> of<01:18:09.160> intuit,<01:18:09.960> this<01:18:10.160> KV<01:18:10.680> cache" + }, + { + "start": 4690.95, + "duration": 0.0, + "text": "might sort of intuit, this KV cache" + }, + { + "start": 4690.96, + "duration": 0.0, + "text": "might sort of intuit, this KV cache approach<01:18:11.800> is<01:18:11.920> going<01:18:12.080> to<01:18:12.160> be<01:18:12.680> reading<01:18:13.240> and<01:18:13.400> and" + }, + { + "start": 4693.75, + "duration": 0.0, + "text": "approach is going to be reading and and" + }, + { + "start": 4693.76, + "duration": 0.0, + "text": "approach is going to be reading and and reading<01:18:14.560> um<01:18:14.920> parameters<01:18:15.480> all<01:18:15.680> the<01:18:15.800> time," + }, + { + "start": 4696.19, + "duration": 0.0, + "text": "reading um parameters all the time," + }, + { + "start": 4696.2, + "duration": 0.0, + "text": "reading um parameters all the time, right?<01:18:16.440> Each<01:18:16.680> time<01:18:16.920> I<01:18:16.960> have<01:18:17.120> a<01:18:17.160> new<01:18:17.360> step," + }, + { + "start": 4698.11, + "duration": 0.0, + "text": "right? Each time I have a new step," + }, + { + "start": 4698.12, + "duration": 0.0, + "text": "right? Each time I have a new step, right?<01:18:18.320> I'm<01:18:18.400> going<01:18:18.520> to<01:18:18.560> have<01:18:18.719> to<01:18:18.880> read<01:18:19.160> in<01:18:19.280> my" + }, + { + "start": 4699.43, + "duration": 0.0, + "text": "right? I'm going to have to read in my" + }, + { + "start": 4699.44, + "duration": 0.0, + "text": "right? I'm going to have to read in my parameters.<01:18:20.000> I'm<01:18:20.080> going<01:18:20.200> to<01:18:20.280> have<01:18:20.400> to<01:18:20.440> take" + }, + { + "start": 4700.63, + "duration": 0.0, + "text": "parameters. 
I'm going to have to take" + }, + { + "start": 4700.64, + "duration": 0.0, + "text": "parameters. I'm going to have to take these<01:18:20.840> dot<01:18:21.080> products,<01:18:21.800> and<01:18:21.880> I'm<01:18:21.920> going<01:18:22.040> to<01:18:22.120> do" + }, + { + "start": 4702.35, + "duration": 0.0, + "text": "these dot products, and I'm going to do" + }, + { + "start": 4702.36, + "duration": 0.0, + "text": "these dot products, and I'm going to do this<01:18:22.600> once<01:18:22.960> every<01:18:23.320> step." + }, + { + "start": 4704.51, + "duration": 0.0, + "text": "this once every step." + }, + { + "start": 4704.52, + "duration": 0.0, + "text": "this once every step. And<01:18:24.719> so,<01:18:24.840> now<01:18:25.200> what<01:18:25.360> do<01:18:25.480> I<01:18:25.560> have?<01:18:25.880> Well,<01:18:26.560> you" + }, + { + "start": 4706.67, + "duration": 0.0, + "text": "And so, now what do I have? Well, you" + }, + { + "start": 4706.68, + "duration": 0.0, + "text": "And so, now what do I have? Well, you know,<01:18:26.800> my<01:18:26.960> total<01:18:27.320> memory<01:18:27.880> Oh,<01:18:27.920> sorry.<01:18:28.360> My" + }, + { + "start": 4708.51, + "duration": 0.0, + "text": "know, my total memory Oh, sorry. My" + }, + { + "start": 4708.52, + "duration": 0.0, + "text": "know, my total memory Oh, sorry. My total<01:18:28.840> arithmetic<01:18:29.320> operations<01:18:29.800> are<01:18:29.880> the" + }, + { + "start": 4709.99, + "duration": 0.0, + "text": "total arithmetic operations are the" + }, + { + "start": 4710.0, + "duration": 0.0, + "text": "total arithmetic operations are the same.<01:18:30.360> I'm<01:18:30.480> multiplying<01:18:31.000> the<01:18:31.080> same<01:18:31.400> matrices" + }, + { + "start": 4711.99, + "duration": 0.0, + "text": "same. I'm multiplying the same matrices" + }, + { + "start": 4712.0, + "duration": 0.0, + "text": "same. 
I'm multiplying the same matrices still,<01:18:32.360> right?<01:18:32.560> Just<01:18:32.800> incrementally<01:18:33.480> rather" + }, + { + "start": 4713.709, + "duration": 0.0, + "text": "still, right? Just incrementally rather" + }, + { + "start": 4713.719, + "duration": 0.0, + "text": "still, right? Just incrementally rather than<01:18:33.880> all<01:18:34.040> at<01:18:34.200> once.<01:18:35.120> But<01:18:35.320> because<01:18:35.719> I'm<01:18:35.840> doing" + }, + { + "start": 4716.11, + "duration": 0.0, + "text": "than all at once. But because I'm doing" + }, + { + "start": 4716.12, + "duration": 0.0, + "text": "than all at once. But because I'm doing this<01:18:36.280> incrementally,<01:18:37.520> you<01:18:37.640> know,<01:18:37.920> now<01:18:38.240> I<01:18:38.320> have" + }, + { + "start": 4719.19, + "duration": 0.0, + "text": "this incrementally, you know, now I have" + }, + { + "start": 4719.2, + "duration": 0.0, + "text": "this incrementally, you know, now I have um<01:18:39.719> a<01:18:39.840> a<01:18:39.880> memory<01:18:40.240> access<01:18:40.600> pattern<01:18:41.040> of<01:18:41.360> batch<01:18:42.040> by" + }, + { + "start": 4722.19, + "duration": 0.0, + "text": "um a a memory access pattern of batch by" + }, + { + "start": 4722.2, + "duration": 0.0, + "text": "um a a memory access pattern of batch by sequence<01:18:42.680> squared<01:18:43.440> by<01:18:43.640> hidden<01:18:43.960> dim<01:18:44.240> plus<01:18:45.320> um" + }, + { + "start": 4725.47, + "duration": 0.0, + "text": "sequence squared by hidden dim plus um" + }, + { + "start": 4725.48, + "duration": 0.0, + "text": "sequence squared by hidden dim plus um sequence<01:18:46.000> by<01:18:46.680> hidden<01:18:46.920> dim<01:18:47.080> squared.<01:18:47.360> And<01:18:47.440> the" + }, + { + "start": 4727.51, + "duration": 0.0, + "text": "sequence by hidden dim squared. And the" + }, + { + "start": 4727.52, + "duration": 0.0, + "text": "sequence by hidden dim squared. 
And the second<01:18:47.960> term<01:18:48.760> is<01:18:48.960> not<01:18:49.400> so<01:18:49.600> pleasant,<01:18:50.120> right?" + }, + { + "start": 4730.35, + "duration": 0.0, + "text": "second term is not so pleasant, right?" + }, + { + "start": 4730.36, + "duration": 0.0, + "text": "second term is not so pleasant, right? It<01:18:50.520> used<01:18:50.760> to<01:18:50.880> be<01:18:51.680> that<01:18:52.560> it<01:18:52.680> was<01:18:52.920> just<01:18:53.200> D" + }, + { + "start": 4733.39, + "duration": 0.0, + "text": "It used to be that it was just D" + }, + { + "start": 4733.4, + "duration": 0.0, + "text": "It used to be that it was just D squared,<01:18:54.280> but<01:18:54.440> now<01:18:54.719> we've<01:18:54.960> got<01:18:55.840> N<01:18:56.160> times<01:18:56.520> D" + }, + { + "start": 4736.67, + "duration": 0.0, + "text": "squared, but now we've got N times D" + }, + { + "start": 4736.68, + "duration": 0.0, + "text": "squared, but now we've got N times D squared.<01:18:57.120> And<01:18:57.240> if<01:18:57.360> we<01:18:57.480> compute<01:18:57.880> the" + }, + { + "start": 4737.99, + "duration": 0.0, + "text": "squared. And if we compute the" + }, + { + "start": 4738.0, + "duration": 0.0, + "text": "squared. 
And if we compute the arithmetic<01:18:58.400> intensity,<01:18:58.800> which<01:18:58.960> is<01:18:59.040> the<01:18:59.160> ratio" + }, + { + "start": 4739.63, + "duration": 0.0, + "text": "arithmetic intensity, which is the ratio" + }, + { + "start": 4739.64, + "duration": 0.0, + "text": "arithmetic intensity, which is the ratio of<01:18:59.760> these<01:18:59.960> two<01:19:00.120> guys,<01:19:01.000> um<01:19:01.760> now<01:19:02.080> we<01:19:02.240> have<01:19:02.840> N<01:19:03.120> over" + }, + { + "start": 4743.39, + "duration": 0.0, + "text": "of these two guys, um now we have N over" + }, + { + "start": 4743.4, + "duration": 0.0, + "text": "of these two guys, um now we have N over D<01:19:03.640> plus<01:19:03.880> one<01:19:04.080> over<01:19:04.320> B.<01:19:05.000> So,<01:19:05.160> now<01:19:05.400> what<01:19:05.560> we<01:19:05.680> need" + }, + { + "start": 4746.23, + "duration": 0.0, + "text": "D plus one over B. So, now what we need" + }, + { + "start": 4746.24, + "duration": 0.0, + "text": "D plus one over B. So, now what we need is<01:19:06.440> large<01:19:06.800> batches<01:19:07.280> plus<01:19:07.560> short<01:19:07.800> sequence" + }, + { + "start": 4748.23, + "duration": 0.0, + "text": "is large batches plus short sequence" + }, + { + "start": 4748.24, + "duration": 0.0, + "text": "is large batches plus short sequence length,<01:19:08.840> or<01:19:09.120> we<01:19:09.240> need<01:19:09.480> really<01:19:09.800> big<01:19:10.040> model" + }, + { + "start": 4750.35, + "duration": 0.0, + "text": "length, or we need really big model" + }, + { + "start": 4750.36, + "duration": 0.0, + "text": "length, or we need really big model dimensions.<01:19:11.080> So,<01:19:11.200> if<01:19:11.280> we<01:19:11.360> want<01:19:11.520> to<01:19:11.600> serve<01:19:11.880> a" + }, + { + "start": 4752.15, + "duration": 0.0, + "text": "dimensions. So, if we want to serve a" + }, + { + "start": 4752.16, + "duration": 0.0, + "text": "dimensions. 
So, if we want to serve a small<01:19:12.680> model<01:19:13.000> efficiently,<01:19:13.960> this<01:19:14.160> is<01:19:14.320> not<01:19:14.800> so" + }, + { + "start": 4755.03, + "duration": 0.0, + "text": "small model efficiently, this is not so" + }, + { + "start": 4755.04, + "duration": 0.0, + "text": "small model efficiently, this is not so good." + }, + { + "start": 4756.31, + "duration": 0.0, + "text": "good." + }, + { + "start": 4756.32, + "duration": 0.0, + "text": "good. Right?<01:19:17.160> Um<01:19:17.600> this<01:19:17.760> is<01:19:17.920> really<01:19:18.240> difficult<01:19:18.680> to" + }, + { + "start": 4758.79, + "duration": 0.0, + "text": "Right? Um this is really difficult to" + }, + { + "start": 4758.8, + "duration": 0.0, + "text": "Right? Um this is really difficult to deal<01:19:19.080> with,<01:19:19.360> right?<01:19:19.480> This<01:19:19.719> N<01:19:19.960> over<01:19:20.200> D<01:19:20.440> term," + }, + { + "start": 4760.67, + "duration": 0.0, + "text": "deal with, right? This N over D term," + }, + { + "start": 4760.68, + "duration": 0.0, + "text": "deal with, right? 
This N over D term, this<01:19:20.840> first<01:19:21.160> term<01:19:21.400> over<01:19:21.680> here,<01:19:22.360> which<01:19:22.520> is" + }, + { + "start": 4762.63, + "duration": 0.0, + "text": "this first term over here, which is" + }, + { + "start": 4762.64, + "duration": 0.0, + "text": "this first term over here, which is sequence<01:19:23.160> length<01:19:23.760> over<01:19:24.040> hidden<01:19:24.360> dim,<01:19:24.920> is<01:19:25.120> very" + }, + { + "start": 4765.35, + "duration": 0.0, + "text": "sequence length over hidden dim, is very" + }, + { + "start": 4765.36, + "duration": 0.0, + "text": "sequence length over hidden dim, is very difficult<01:19:25.840> to<01:19:25.960> reduce<01:19:26.360> if<01:19:26.480> we're<01:19:26.600> doing<01:19:26.840> this" + }, + { + "start": 4766.99, + "duration": 0.0, + "text": "difficult to reduce if we're doing this" + }, + { + "start": 4767.0, + "duration": 0.0, + "text": "difficult to reduce if we're doing this incremental<01:19:27.600> computation.<01:19:28.880> This<01:19:29.040> is<01:19:29.120> just<01:19:29.320> a" + }, + { + "start": 4769.35, + "duration": 0.0, + "text": "incremental computation. This is just a" + }, + { + "start": 4769.36, + "duration": 0.0, + "text": "incremental computation. This is just a hard<01:19:29.640> thing<01:19:29.800> to<01:19:29.920> deal<01:19:30.120> with." + }, + { + "start": 4772.44, + "duration": 0.0, + "text": "So,<01:19:33.040> this<01:19:33.440> leads<01:19:33.680> to<01:19:33.760> this<01:19:33.960> idea<01:19:34.280> of<01:19:34.440> MQA<01:19:35.160> or" + }, + { + "start": 4775.27, + "duration": 0.0, + "text": "So, this leads to this idea of MQA or" + }, + { + "start": 4775.28, + "duration": 0.0, + "text": "So, this leads to this idea of MQA or multi-query<01:19:35.920> attention.<01:19:36.840> Normally,<01:19:37.440> you" + }, + { + "start": 4777.55, + "duration": 0.0, + "text": "multi-query attention. 
Normally, you" + }, + { + "start": 4777.56, + "duration": 0.0, + "text": "multi-query attention. Normally, you have<01:19:37.800> multiple<01:19:38.280> heads<01:19:38.600> in<01:19:38.680> your<01:19:38.840> attention" + }, + { + "start": 4779.71, + "duration": 0.0, + "text": "have multiple heads in your attention" + }, + { + "start": 4779.72, + "duration": 0.0, + "text": "have multiple heads in your attention operation,<01:19:40.720> and<01:19:40.840> you're<01:19:40.920> going<01:19:41.040> to<01:19:41.080> have" + }, + { + "start": 4781.31, + "duration": 0.0, + "text": "operation, and you're going to have" + }, + { + "start": 4781.32, + "duration": 0.0, + "text": "operation, and you're going to have different<01:19:41.720> keys,<01:19:42.080> different<01:19:42.440> values,<01:19:43.160> and" + }, + { + "start": 4783.27, + "duration": 0.0, + "text": "different keys, different values, and" + }, + { + "start": 4783.28, + "duration": 0.0, + "text": "different keys, different values, and different<01:19:43.640> queries.<01:19:44.040> That's<01:19:44.240> normally<01:19:44.720> how" + }, + { + "start": 4784.87, + "duration": 0.0, + "text": "different queries. That's normally how" + }, + { + "start": 4784.88, + "duration": 0.0, + "text": "different queries. That's normally how you<01:19:45.000> do<01:19:45.200> things." + }, + { + "start": 4786.15, + "duration": 0.0, + "text": "you do things." + }, + { + "start": 4786.16, + "duration": 0.0, + "text": "you do things. 
But,<01:19:46.880> one<01:19:47.080> thing<01:19:47.240> that<01:19:47.360> we<01:19:47.480> could<01:19:47.640> do<01:19:48.200> is<01:19:48.360> maybe" + }, + { + "start": 4788.63, + "duration": 0.0, + "text": "But, one thing that we could do is maybe" + }, + { + "start": 4788.64, + "duration": 0.0, + "text": "But, one thing that we could do is maybe we<01:19:48.760> can<01:19:48.960> keep<01:19:49.200> the<01:19:49.360> keys<01:19:49.880> and<01:19:50.040> the<01:19:50.160> Vs<01:19:50.520> the<01:19:50.640> same" + }, + { + "start": 4791.11, + "duration": 0.0, + "text": "we can keep the keys and the Vs the same" + }, + { + "start": 4791.12, + "duration": 0.0, + "text": "we can keep the keys and the Vs the same across<01:19:51.480> all<01:19:51.640> the<01:19:51.760> heads,<01:19:52.480> and<01:19:52.600> the<01:19:52.760> only<01:19:53.040> thing" + }, + { + "start": 4793.15, + "duration": 0.0, + "text": "across all the heads, and the only thing" + }, + { + "start": 4793.16, + "duration": 0.0, + "text": "across all the heads, and the only thing that's<01:19:53.360> different<01:19:53.720> across<01:19:54.040> the<01:19:54.160> heads<01:19:54.800> are" + }, + { + "start": 4794.91, + "duration": 0.0, + "text": "that's different across the heads are" + }, + { + "start": 4794.92, + "duration": 0.0, + "text": "that's different across the heads are the<01:19:55.040> queries.<01:19:56.040> If<01:19:56.200> we<01:19:56.360> do<01:19:56.640> this,<01:19:57.400> then<01:19:57.680> this" + }, + { + "start": 4797.91, + "duration": 0.0, + "text": "the queries. If we do this, then this" + }, + { + "start": 4797.92, + "duration": 0.0, + "text": "the queries. 
If we do this, then this drastically<01:19:58.800> removes<01:19:59.640> the<01:19:59.720> amount<01:20:00.160> of<01:20:01.080> items" + }, + { + "start": 4801.43, + "duration": 0.0, + "text": "drastically removes the amount of items" + }, + { + "start": 4801.44, + "duration": 0.0, + "text": "drastically removes the amount of items that<01:20:01.560> need<01:20:01.680> to<01:20:01.720> be<01:20:01.800> moved<01:20:02.120> in<01:20:02.240> and<01:20:02.360> out<01:20:02.480> of" + }, + { + "start": 4802.55, + "duration": 0.0, + "text": "that need to be moved in and out of" + }, + { + "start": 4802.56, + "duration": 0.0, + "text": "that need to be moved in and out of memory,<01:20:02.960> right?<01:20:03.160> Because<01:20:03.400> the<01:20:03.520> KV<01:20:03.960> cache," + }, + { + "start": 4804.51, + "duration": 0.0, + "text": "memory, right? Because the KV cache," + }, + { + "start": 4804.52, + "duration": 0.0, + "text": "memory, right? Because the KV cache, right?<01:20:05.080> Are<01:20:05.200> now<01:20:05.760> significantly<01:20:06.560> smaller." + }, + { + "start": 4806.99, + "duration": 0.0, + "text": "right? Are now significantly smaller." + }, + { + "start": 4807.0, + "duration": 0.0, + "text": "right? Are now significantly smaller. These<01:20:07.200> are<01:20:07.320> all<01:20:07.560> shared<01:20:07.840> across<01:20:08.200> all<01:20:08.440> the" + }, + { + "start": 4808.55, + "duration": 0.0, + "text": "These are all shared across all the" + }, + { + "start": 4808.56, + "duration": 0.0, + "text": "These are all shared across all the heads.<01:20:09.480> Um<01:20:10.160> this<01:20:10.480> significantly<01:20:11.160> reduces<01:20:11.600> the" + }, + { + "start": 4811.71, + "duration": 0.0, + "text": "heads. Um this significantly reduces the" + }, + { + "start": 4811.72, + "duration": 0.0, + "text": "heads. 
Um this significantly reduces the total<01:20:12.000> memory<01:20:12.400> access<01:20:13.000> as<01:20:13.200> well<01:20:13.440> as<01:20:13.560> the" + }, + { + "start": 4813.67, + "duration": 0.0, + "text": "total memory access as well as the" + }, + { + "start": 4813.68, + "duration": 0.0, + "text": "total memory access as well as the arithmetic<01:20:14.080> intensity,<01:20:14.960> and<01:20:15.080> we're<01:20:15.200> kind<01:20:15.440> of" + }, + { + "start": 4816.19, + "duration": 0.0, + "text": "arithmetic intensity, and we're kind of" + }, + { + "start": 4816.2, + "duration": 0.0, + "text": "arithmetic intensity, and we're kind of the<01:20:16.320> key<01:20:16.560> term<01:20:16.840> that<01:20:16.960> we<01:20:17.040> were<01:20:17.120> talking<01:20:17.480> about" + }, + { + "start": 4817.75, + "duration": 0.0, + "text": "the key term that we were talking about" + }, + { + "start": 4817.76, + "duration": 0.0, + "text": "the key term that we were talking about here,<01:20:18.280> we<01:20:18.400> had<01:20:18.520> the<01:20:18.680> N<01:20:18.840> over<01:20:19.160> D<01:20:19.400> term.<01:20:20.120> Now,<01:20:20.360> we" + }, + { + "start": 4820.47, + "duration": 0.0, + "text": "here, we had the N over D term. Now, we" + }, + { + "start": 4820.48, + "duration": 0.0, + "text": "here, we had the N over D term. Now, we have<01:20:21.080> uh<01:20:21.560> H<01:20:22.160> multiplying<01:20:22.840> this,<01:20:23.120> right?<01:20:23.360> And" + }, + { + "start": 4823.47, + "duration": 0.0, + "text": "have uh H multiplying this, right? And" + }, + { + "start": 4823.48, + "duration": 0.0, + "text": "have uh H multiplying this, right? 
And so,<01:20:23.600> this<01:20:24.200> H<01:20:24.520> term<01:20:25.280> allows<01:20:25.680> us<01:20:25.800> to" + }, + { + "start": 4825.87, + "duration": 0.0, + "text": "so, this H term allows us to" + }, + { + "start": 4825.88, + "duration": 0.0, + "text": "so, this H term allows us to significantly<01:20:26.600> reduce<01:20:27.360> the<01:20:28.160> the<01:20:29.200> I<01:20:29.240> sorry," + }, + { + "start": 4829.47, + "duration": 0.0, + "text": "significantly reduce the the I sorry," + }, + { + "start": 4829.48, + "duration": 0.0, + "text": "significantly reduce the the I sorry, increase<01:20:29.840> the<01:20:29.920> arithmetic<01:20:30.280> intensity<01:20:31.000> if<01:20:31.120> we" + }, + { + "start": 4831.27, + "duration": 0.0, + "text": "increase the arithmetic intensity if we" + }, + { + "start": 4831.28, + "duration": 0.0, + "text": "increase the arithmetic intensity if we have<01:20:31.560> a<01:20:31.640> lot<01:20:31.880> of<01:20:32.000> heads,<01:20:32.360> right?<01:20:32.480> This<01:20:32.600> is<01:20:32.680> a" + }, + { + "start": 4832.71, + "duration": 0.0, + "text": "have a lot of heads, right? This is a" + }, + { + "start": 4832.72, + "duration": 0.0, + "text": "have a lot of heads, right? This is a significant<01:20:33.240> gain<01:20:33.800> over<01:20:34.040> what<01:20:34.240> we<01:20:34.360> had" + }, + { + "start": 4835.23, + "duration": 0.0, + "text": "significant gain over what we had" + }, + { + "start": 4835.24, + "duration": 0.0, + "text": "significant gain over what we had before." + }, + { + "start": 4836.71, + "duration": 0.0, + "text": "before." + }, + { + "start": 4836.72, + "duration": 0.0, + "text": "before. 
So,<01:20:37.320> this<01:20:37.520> gets<01:20:37.720> us<01:20:38.000> significant<01:20:38.560> efficiency" + }, + { + "start": 4839.03, + "duration": 0.0, + "text": "So, this gets us significant efficiency" + }, + { + "start": 4839.04, + "duration": 0.0, + "text": "So, this gets us significant efficiency improvements," + }, + { + "start": 4840.35, + "duration": 0.0, + "text": "improvements," + }, + { + "start": 4840.36, + "duration": 0.0, + "text": "improvements, but<01:20:41.000> the<01:20:41.200> issue<01:20:41.520> with<01:20:41.720> MQA<01:20:42.880> is" + }, + { + "start": 4844.03, + "duration": 0.0, + "text": "but the issue with MQA is" + }, + { + "start": 4844.04, + "duration": 0.0, + "text": "but the issue with MQA is this<01:20:44.200> is<01:20:44.320> on<01:20:44.400> the<01:20:44.520> right<01:20:44.760> here,<01:20:45.160> you<01:20:45.280> have<01:20:45.480> one" + }, + { + "start": 4845.71, + "duration": 0.0, + "text": "this is on the right here, you have one" + }, + { + "start": 4845.72, + "duration": 0.0, + "text": "this is on the right here, you have one value<01:20:46.240> and<01:20:46.480> one<01:20:46.720> key<01:20:47.200> for<01:20:47.480> all<01:20:47.720> these<01:20:47.960> queries." + }, + { + "start": 4848.71, + "duration": 0.0, + "text": "value and one key for all these queries." + }, + { + "start": 4848.72, + "duration": 0.0, + "text": "value and one key for all these queries. You<01:20:48.920> do<01:20:49.160> in<01:20:49.280> fact<01:20:49.680> lose<01:20:49.920> significant" + }, + { + "start": 4850.43, + "duration": 0.0, + "text": "You do in fact lose significant" + }, + { + "start": 4850.44, + "duration": 0.0, + "text": "You do in fact lose significant expressive<01:20:51.000> power<01:20:51.400> if<01:20:51.520> you<01:20:51.640> do<01:20:51.840> this." + }, + { + "start": 4853.11, + "duration": 0.0, + "text": "expressive power if you do this." 
+ }, + { + "start": 4853.12, + "duration": 0.0, + "text": "expressive power if you do this. And<01:20:53.360> so,<01:20:53.680> there's<01:20:53.880> this<01:20:54.120> trade-off<01:20:54.640> between" + }, + { + "start": 4854.95, + "duration": 0.0, + "text": "And so, there's this trade-off between" + }, + { + "start": 4854.96, + "duration": 0.0, + "text": "And so, there's this trade-off between system's<01:20:55.400> efficiency<01:20:56.120> and<01:20:56.240> expressiveness," + }, + { + "start": 4856.95, + "duration": 0.0, + "text": "system's efficiency and expressiveness," + }, + { + "start": 4856.96, + "duration": 0.0, + "text": "system's efficiency and expressiveness, and<01:20:57.080> you<01:20:57.160> might<01:20:57.440> wonder,<01:20:58.200> is<01:20:58.360> there<01:20:58.520> sort<01:20:58.680> of<01:20:58.800> a" + }, + { + "start": 4858.83, + "duration": 0.0, + "text": "and you might wonder, is there sort of a" + }, + { + "start": 4858.84, + "duration": 0.0, + "text": "and you might wonder, is there sort of a sweet<01:20:59.280> spot<01:20:59.960> in<01:21:00.160> which<01:21:00.520> we<01:21:00.600> can<01:21:00.760> avoid<01:21:01.640> trading" + }, + { + "start": 4862.07, + "duration": 0.0, + "text": "sweet spot in which we can avoid trading" + }, + { + "start": 4862.08, + "duration": 0.0, + "text": "sweet spot in which we can avoid trading off,<01:21:02.720> you<01:21:02.840> know,<01:21:03.400> quite<01:21:03.600> a<01:21:03.640> significantly" + }, + { + "start": 4865.43, + "duration": 0.0, + "text": "off, you know, quite a significantly" + }, + { + "start": 4865.44, + "duration": 0.0, + "text": "off, you know, quite a significantly expressive<01:21:06.040> power<01:21:06.640> and<01:21:06.800> computation.<01:21:07.400> And" + }, + { + "start": 4867.51, + "duration": 0.0, + "text": "expressive power and computation. And" + }, + { + "start": 4867.52, + "duration": 0.0, + "text": "expressive power and computation. 
And that's<01:21:07.720> where<01:21:08.240> GQA<01:21:08.800> or<01:21:08.920> grouped<01:21:09.280> query" + }, + { + "start": 4869.51, + "duration": 0.0, + "text": "that's where GQA or grouped query" + }, + { + "start": 4869.52, + "duration": 0.0, + "text": "that's where GQA or grouped query attention<01:21:09.880> comes<01:21:10.160> in.<01:21:10.720> You<01:21:10.800> know,<01:21:10.920> the" + }, + { + "start": 4871.03, + "duration": 0.0, + "text": "attention comes in. You know, the" + }, + { + "start": 4871.04, + "duration": 0.0, + "text": "attention comes in. You know, the original<01:21:11.440> transformer<01:21:12.080> is<01:21:12.240> multi-head.<01:21:12.840> We" + }, + { + "start": 4872.95, + "duration": 0.0, + "text": "original transformer is multi-head. We" + }, + { + "start": 4872.96, + "duration": 0.0, + "text": "original transformer is multi-head. We have<01:21:13.480> queries<01:21:13.880> and<01:21:14.000> keys<01:21:14.280> for<01:21:14.440> each<01:21:14.680> head.<01:21:15.320> In" + }, + { + "start": 4875.47, + "duration": 0.0, + "text": "have queries and keys for each head. In" + }, + { + "start": 4875.48, + "duration": 0.0, + "text": "have queries and keys for each head. In multi-query,<01:21:16.240> we<01:21:16.400> have<01:21:16.680> one<01:21:16.920> key<01:21:17.160> and<01:21:17.280> value" + }, + { + "start": 4877.83, + "duration": 0.0, + "text": "multi-query, we have one key and value" + }, + { + "start": 4877.84, + "duration": 0.0, + "text": "multi-query, we have one key and value for<01:21:18.080> each<01:21:18.640> for<01:21:18.840> all<01:21:19.000> the<01:21:19.080> heads.<01:21:19.680> In<01:21:19.840> grouped" + }, + { + "start": 4880.27, + "duration": 0.0, + "text": "for each for all the heads. In grouped" + }, + { + "start": 4880.28, + "duration": 0.0, + "text": "for each for all the heads. 
In grouped query,<01:21:20.880> we<01:21:21.040> reduce<01:21:21.520> the<01:21:21.640> amount<01:21:22.000> of<01:21:22.160> keys<01:21:22.400> and" + }, + { + "start": 4882.51, + "duration": 0.0, + "text": "query, we reduce the amount of keys and" + }, + { + "start": 4882.52, + "duration": 0.0, + "text": "query, we reduce the amount of keys and values,<01:21:23.240> but<01:21:23.360> we<01:21:23.480> keep<01:21:23.720> the<01:21:23.840> number<01:21:24.080> of" + }, + { + "start": 4884.19, + "duration": 0.0, + "text": "values, but we keep the number of" + }, + { + "start": 4884.2, + "duration": 0.0, + "text": "values, but we keep the number of queries<01:21:24.520> the<01:21:24.600> same.<01:21:24.840> So,<01:21:24.960> we<01:21:25.040> now<01:21:25.240> have<01:21:25.400> this" + }, + { + "start": 4885.59, + "duration": 0.0, + "text": "queries the same. So, we now have this" + }, + { + "start": 4885.6, + "duration": 0.0, + "text": "queries the same. So, we now have this ratio<01:21:26.040> that<01:21:26.160> we<01:21:26.240> can<01:21:26.400> play<01:21:26.640> with,<01:21:27.160> which<01:21:27.320> is" + }, + { + "start": 4887.43, + "duration": 0.0, + "text": "ratio that we can play with, which is" + }, + { + "start": 4887.44, + "duration": 0.0, + "text": "ratio that we can play with, which is kind<01:21:27.560> of<01:21:27.640> the<01:21:27.720> number<01:21:28.160> of<01:21:28.280> key<01:21:28.520> heads<01:21:28.880> or<01:21:29.000> the" + }, + { + "start": 4889.07, + "duration": 0.0, + "text": "kind of the number of key heads or the" + }, + { + "start": 4889.08, + "duration": 0.0, + "text": "kind of the number of key heads or the number<01:21:29.320> of<01:21:29.400> value<01:21:29.800> heads" + }, + { + "start": 4890.91, + "duration": 0.0, + "text": "number of value heads" + }, + { + "start": 4890.92, + "duration": 0.0, + "text": "number of value heads while<01:21:31.120> keeping<01:21:31.400> the<01:21:31.520> total<01:21:31.880> number<01:21:32.160> 
of<01:21:32.280> heads" + }, + { + "start": 4892.43, + "duration": 0.0, + "text": "while keeping the total number of heads" + }, + { + "start": 4892.44, + "duration": 0.0, + "text": "while keeping the total number of heads much<01:21:32.680> larger<01:21:33.080> than<01:21:33.280> that.<01:21:33.480> So,<01:21:33.560> this<01:21:33.720> allows" + }, + { + "start": 4894.15, + "duration": 0.0, + "text": "much larger than that. So, this allows" + }, + { + "start": 4894.16, + "duration": 0.0, + "text": "much larger than that. So, this allows us<01:21:34.280> to<01:21:34.400> very<01:21:34.680> simply<01:21:35.040> control<01:21:35.480> the" + }, + { + "start": 4896.19, + "duration": 0.0, + "text": "us to very simply control the" + }, + { + "start": 4896.2, + "duration": 0.0, + "text": "us to very simply control the the<01:21:36.600> trade-off<01:21:37.080> between<01:21:37.320> expressiveness<01:21:38.440> and" + }, + { + "start": 4898.55, + "duration": 0.0, + "text": "the trade-off between expressiveness and" + }, + { + "start": 4898.56, + "duration": 0.0, + "text": "the trade-off between expressiveness and inference<01:21:38.880> efficiency." + }, + { + "start": 4900.39, + "duration": 0.0, + "text": "inference efficiency." + }, + { + "start": 4900.4, + "duration": 0.0, + "text": "inference efficiency. 
Um<01:21:40.880> there<01:21:41.040> are<01:21:41.120> other<01:21:41.360> sort<01:21:41.520> of<01:21:41.600> tricks<01:21:41.960> from" + }, + { + "start": 4902.11, + "duration": 0.0, + "text": "Um there are other sort of tricks from" + }, + { + "start": 4902.12, + "duration": 0.0, + "text": "Um there are other sort of tricks from DeepSeek-V2,<01:21:43.320> multi-head<01:21:43.800> latent" + }, + { + "start": 4904.07, + "duration": 0.0, + "text": "DeepSeek-V2, multi-head latent" + }, + { + "start": 4904.08, + "duration": 0.0, + "text": "DeepSeek-V2, multi-head latent attention,<01:21:45.080> that<01:21:45.240> I'll<01:21:45.560> sort<01:21:45.720> of<01:21:45.800> mention" + }, + { + "start": 4906.11, + "duration": 0.0, + "text": "attention, that I'll sort of mention" + }, + { + "start": 4906.12, + "duration": 0.0, + "text": "attention, that I'll sort of mention briefly<01:21:46.880> next<01:21:47.240> time," + }, + { + "start": 4908.79, + "duration": 0.0, + "text": "briefly next time," + }, + { + "start": 4908.8, + "duration": 0.0, + "text": "briefly next time, which<01:21:49.200> sort<01:21:49.360> of<01:21:49.480> have<01:21:49.600> a<01:21:49.680> different<01:21:50.120> kind<01:21:50.280> of" + }, + { + "start": 4910.35, + "duration": 0.0, + "text": "which sort of have a different kind of" + }, + { + "start": 4910.36, + "duration": 0.0, + "text": "which sort of have a different kind of factorization<01:21:51.000> structure<01:21:51.560> and<01:21:51.680> a<01:21:51.720> different" + }, + { + "start": 4912.03, + "duration": 0.0, + "text": "factorization structure and a different" + }, + { + "start": 4912.04, + "duration": 0.0, + "text": "factorization structure and a different set<01:21:52.160> of<01:21:52.280> trade-offs.<01:21:53.200> But<01:21:53.440> really," + }, + { + "start": 4914.71, + "duration": 0.0, + "text": "set of trade-offs. But really," + }, + { + "start": 4914.72, + "duration": 0.0, + "text": "set of trade-offs. 
But really, the<01:21:54.840> nice<01:21:55.080> thing<01:21:55.200> about<01:21:55.560> GQA<01:21:56.640> is<01:21:56.800> that<01:21:56.920> in" + }, + { + "start": 4917.03, + "duration": 0.0, + "text": "the nice thing about GQA is that in" + }, + { + "start": 4917.04, + "duration": 0.0, + "text": "the nice thing about GQA is that in practice<01:21:57.480> the<01:21:57.600> trade-off<01:21:58.040> is<01:21:58.200> quite" + }, + { + "start": 4918.55, + "duration": 0.0, + "text": "practice the trade-off is quite" + }, + { + "start": 4918.56, + "duration": 0.0, + "text": "practice the trade-off is quite favorable.<01:21:59.200> So,<01:21:59.320> if<01:21:59.440> you<01:21:59.560> have<01:22:00.120> multi-head," + }, + { + "start": 4920.99, + "duration": 0.0, + "text": "favorable. So, if you have multi-head," + }, + { + "start": 4921.0, + "duration": 0.0, + "text": "favorable. So, if you have multi-head, your<01:22:01.120> performance,<01:22:01.760> this<01:22:01.920> was<01:22:02.280> you<01:22:02.400> know,<01:22:02.520> in" + }, + { + "start": 4922.67, + "duration": 0.0, + "text": "your performance, this was you know, in" + }, + { + "start": 4922.68, + "duration": 0.0, + "text": "your performance, this was you know, in the<01:22:03.320> I<01:22:03.400> think<01:22:03.600> T5<01:22:04.160> days<01:22:04.440> if<01:22:04.560> I<01:22:04.640> remember<01:22:04.960> right." + }, + { + "start": 4925.95, + "duration": 0.0, + "text": "the I think T5 days if I remember right." + }, + { + "start": 4925.96, + "duration": 0.0, + "text": "the I think T5 days if I remember right. 
This<01:22:06.120> is<01:22:06.200> your<01:22:06.800> downstream<01:22:07.480> model" + }, + { + "start": 4927.79, + "duration": 0.0, + "text": "This is your downstream model" + }, + { + "start": 4927.8, + "duration": 0.0, + "text": "This is your downstream model performance.<01:22:08.680> This<01:22:08.840> is<01:22:08.960> your<01:22:09.080> time<01:22:09.400> per" + }, + { + "start": 4929.51, + "duration": 0.0, + "text": "performance. This is your time per" + }, + { + "start": 4929.52, + "duration": 0.0, + "text": "performance. This is your time per sample.<01:22:09.960> You<01:22:10.040> want<01:22:10.160> to<01:22:10.240> reduce<01:22:10.600> this<01:22:10.760> as<01:22:10.880> much" + }, + { + "start": 4931.03, + "duration": 0.0, + "text": "sample. You want to reduce this as much" + }, + { + "start": 4931.04, + "duration": 0.0, + "text": "sample. You want to reduce this as much as<01:22:11.160> possible." + }, + { + "start": 4932.35, + "duration": 0.0, + "text": "as possible." + }, + { + "start": 4932.36, + "duration": 0.0, + "text": "as possible. With<01:22:12.520> multi-head<01:22:12.920> attention,<01:22:13.320> you<01:22:13.400> have<01:22:13.520> best" + }, + { + "start": 4933.75, + "duration": 0.0, + "text": "With multi-head attention, you have best" + }, + { + "start": 4933.76, + "duration": 0.0, + "text": "With multi-head attention, you have best performance<01:22:14.240> but<01:22:14.440> very<01:22:14.720> high<01:22:14.920> cost." + }, + { + "start": 4936.35, + "duration": 0.0, + "text": "performance but very high cost." + }, + { + "start": 4936.36, + "duration": 0.0, + "text": "performance but very high cost. 
With<01:22:16.560> MQA,<01:22:17.680> you<01:22:17.800> know,<01:22:17.920> you<01:22:18.080> have" + }, + { + "start": 4939.15, + "duration": 0.0, + "text": "With MQA, you know, you have" + }, + { + "start": 4939.16, + "duration": 0.0, + "text": "With MQA, you know, you have lower<01:22:20.120> cost<01:22:20.520> but<01:22:20.680> much<01:22:20.920> lower<01:22:21.160> performance." + }, + { + "start": 4942.11, + "duration": 0.0, + "text": "lower cost but much lower performance." + }, + { + "start": 4942.12, + "duration": 0.0, + "text": "lower cost but much lower performance. Similarly,<01:22:22.800> if<01:22:22.960> you<01:22:23.040> make<01:22:23.200> your<01:22:23.320> model" + }, + { + "start": 4943.59, + "duration": 0.0, + "text": "Similarly, if you make your model" + }, + { + "start": 4943.6, + "duration": 0.0, + "text": "Similarly, if you make your model smaller<01:22:24.200> to<01:22:24.320> try<01:22:24.480> to<01:22:24.560> hit<01:22:24.720> your<01:22:25.240> performance" + }, + { + "start": 4945.71, + "duration": 0.0, + "text": "smaller to try to hit your performance" + }, + { + "start": 4945.72, + "duration": 0.0, + "text": "smaller to try to hit your performance targets,<01:22:26.040> you<01:22:26.080> get<01:22:26.240> much<01:22:26.440> worse<01:22:26.600> performance." + }, + { + "start": 4947.47, + "duration": 0.0, + "text": "targets, you get much worse performance." + }, + { + "start": 4947.48, + "duration": 0.0, + "text": "targets, you get much worse performance. 
GQA<01:22:28.080> really<01:22:28.360> does<01:22:28.560> get<01:22:28.760> the<01:22:28.840> best<01:22:29.080> of<01:22:29.160> both" + }, + { + "start": 4949.39, + "duration": 0.0, + "text": "GQA really does get the best of both" + }, + { + "start": 4949.4, + "duration": 0.0, + "text": "GQA really does get the best of both worlds,<01:22:29.880> you<01:22:29.960> know,<01:22:30.520> very<01:22:31.000> low<01:22:31.240> inference" + }, + { + "start": 4951.59, + "duration": 0.0, + "text": "worlds, you know, very low inference" + }, + { + "start": 4951.6, + "duration": 0.0, + "text": "worlds, you know, very low inference cost,<01:22:32.560> nearly<01:22:33.040> the<01:22:33.160> same<01:22:33.360> performance<01:22:34.280> as" + }, + { + "start": 4954.43, + "duration": 0.0, + "text": "cost, nearly the same performance as" + }, + { + "start": 4954.44, + "duration": 0.0, + "text": "cost, nearly the same performance as your<01:22:34.560> full<01:22:34.800> multi-head." + }, + { + "start": 4956.35, + "duration": 0.0, + "text": "your full multi-head." + }, + { + "start": 4956.36, + "duration": 0.0, + "text": "your full multi-head. 
Um<01:22:37.000> and<01:22:37.120> you<01:22:37.240> see<01:22:37.400> sort<01:22:37.560> of<01:22:37.640> this<01:22:37.840> like<01:22:38.600> GQA" + }, + { + "start": 4959.67, + "duration": 0.0, + "text": "Um and you see sort of this like GQA" + }, + { + "start": 4959.68, + "duration": 0.0, + "text": "Um and you see sort of this like GQA group<01:22:39.960> structure<01:22:40.320> where<01:22:40.560> if<01:22:40.720> you<01:22:40.840> have" + }, + { + "start": 4962.47, + "duration": 0.0, + "text": "group structure where if you have" + }, + { + "start": 4962.48, + "duration": 0.0, + "text": "group structure where if you have a<01:22:42.600> small<01:22:42.960> reduction<01:22:43.360> in<01:22:43.440> the<01:22:43.520> number<01:22:43.840> of" + }, + { + "start": 4963.99, + "duration": 0.0, + "text": "a small reduction in the number of" + }, + { + "start": 4964.0, + "duration": 0.0, + "text": "a small reduction in the number of heads," + }, + { + "start": 4965.07, + "duration": 0.0, + "text": "heads," + }, + { + "start": 4965.08, + "duration": 0.0, + "text": "heads, you<01:22:45.200> basically<01:22:45.680> have<01:22:46.200> most<01:22:46.640> of<01:22:46.720> the<01:22:46.800> gains<01:22:47.480> in" + }, + { + "start": 4967.55, + "duration": 0.0, + "text": "you basically have most of the gains in" + }, + { + "start": 4967.56, + "duration": 0.0, + "text": "you basically have most of the gains in your<01:22:47.680> performance,<01:22:48.400> which<01:22:48.600> allows<01:22:48.920> you<01:22:48.960> to" + }, + { + "start": 4969.03, + "duration": 0.0, + "text": "your performance, which allows you to" + }, + { + "start": 4969.04, + "duration": 0.0, + "text": "your performance, which allows you to sort<01:22:49.160> of<01:22:49.240> keep<01:22:49.400> most<01:22:49.640> of<01:22:49.720> the<01:22:49.760> expressive" + }, + { + "start": 4970.27, + "duration": 0.0, + "text": "sort of keep most of the expressive" + }, + { + "start": 4970.28, + "duration": 0.0, + 
"text": "sort of keep most of the expressive power" + }, + { + "start": 4971.47, + "duration": 0.0, + "text": "power" + }, + { + "start": 4971.48, + "duration": 0.0, + "text": "power while<01:22:51.720> getting<01:22:52.160> significant<01:22:53.160> inference" + }, + { + "start": 4973.55, + "duration": 0.0, + "text": "while getting significant inference" + }, + { + "start": 4973.56, + "duration": 0.0, + "text": "while getting significant inference improvements.<01:22:54.200> And<01:22:54.600> Percy<01:22:54.840> will<01:22:54.960> talk<01:22:55.320> a" + }, + { + "start": 4975.35, + "duration": 0.0, + "text": "improvements. And Percy will talk a" + }, + { + "start": 4975.36, + "duration": 0.0, + "text": "improvements. And Percy will talk a bunch<01:22:55.640> more<01:22:55.800> about<01:22:56.080> sort<01:22:56.200> of<01:22:56.280> the<01:22:56.360> inference" + }, + { + "start": 4976.75, + "duration": 0.0, + "text": "bunch more about sort of the inference" + }, + { + "start": 4976.76, + "duration": 0.0, + "text": "bunch more about sort of the inference mechanics<01:22:57.280> later," + }, + { + "start": 4978.67, + "duration": 0.0, + "text": "mechanics later," + }, + { + "start": 4978.68, + "duration": 0.0, + "text": "mechanics later, but<01:22:58.840> sort<01:22:58.960> of<01:22:59.040> this<01:22:59.200> should<01:22:59.360> give<01:22:59.520> you<01:22:59.600> a" + }, + { + "start": 4979.67, + "duration": 0.0, + "text": "but sort of this should give you a" + }, + { + "start": 4979.68, + "duration": 0.0, + "text": "but sort of this should give you a flavor<01:23:00.040> of<01:23:00.160> like<01:23:00.400> why<01:23:01.160> models<01:23:01.600> today<01:23:01.880> almost" + }, + { + "start": 4982.27, + "duration": 0.0, + "text": "flavor of like why models today almost" + }, + { + "start": 4982.28, + "duration": 0.0, + "text": "flavor of like why models today almost all<01:23:02.440> adopt<01:23:02.800> this<01:23:02.960> GQA<01:23:04.080> structure<01:23:04.600> 
because<01:23:04.840> it" + }, + { + "start": 4984.91, + "duration": 0.0, + "text": "all adopt this GQA structure because it" + }, + { + "start": 4984.92, + "duration": 0.0, + "text": "all adopt this GQA structure because it gives<01:23:05.160> you<01:23:05.280> a<01:23:05.360> lot<01:23:05.560> of<01:23:05.640> this<01:23:05.800> inference<01:23:06.200> cost," + }, + { + "start": 4986.47, + "duration": 0.0, + "text": "gives you a lot of this inference cost," + }, + { + "start": 4986.48, + "duration": 0.0, + "text": "gives you a lot of this inference cost, which<01:23:06.640> is<01:23:06.720> really<01:23:06.920> critical,<01:23:07.840> without<01:23:08.680> very" + }, + { + "start": 4988.91, + "duration": 0.0, + "text": "which is really critical, without very" + }, + { + "start": 4988.92, + "duration": 0.0, + "text": "which is really critical, without very much<01:23:09.320> of<01:23:09.640> a" + }, + { + "start": 4991.19, + "duration": 0.0, + "text": "much of a" + }, + { + "start": 4991.2, + "duration": 0.0, + "text": "much of a expressiveness<01:23:11.920> hit." + }, + { + "start": 4994.44, + "duration": 0.0, + "text": "Cool.<01:23:14.880> Any<01:23:15.040> questions<01:23:15.400> for<01:23:15.720> for<01:23:15.880> GQA<01:23:16.840> or<01:23:16.960> KV" + }, + { + "start": 4997.19, + "duration": 0.0, + "text": "Cool. Any questions for for GQA or KV" + }, + { + "start": 4997.2, + "duration": 0.0, + "text": "Cool. Any questions for for GQA or KV cache?<01:23:17.440> Yeah." 
+ }, + { + "start": 4999.76, + "duration": 0.0, + "text": "Given" + }, + { + "start": 5001.55, + "duration": 0.0, + "text": "Given" + }, + { + "start": 5001.56, + "duration": 0.0, + "text": "Given that<01:23:21.680> you<01:23:21.760> have<01:23:22.000> so<01:23:22.200> many" + }, + { + "start": 5003.79, + "duration": 0.0, + "text": "that you have so many" + }, + { + "start": 5003.8, + "duration": 0.0, + "text": "that you have so many like<01:23:23.960> rules<01:23:24.240> of<01:23:24.320> thumb<01:23:24.560> for<01:23:24.800> what" + }, + { + "start": 5004.95, + "duration": 0.0, + "text": "like rules of thumb for what" + }, + { + "start": 5004.96, + "duration": 0.0, + "text": "like rules of thumb for what hyper-parameters<01:23:25.600> are<01:23:25.720> good,<01:23:26.120> like<01:23:26.680> to<01:23:26.800> what" + }, + { + "start": 5006.95, + "duration": 0.0, + "text": "hyper-parameters are good, like to what" + }, + { + "start": 5006.96, + "duration": 0.0, + "text": "hyper-parameters are good, like to what extent<01:23:27.200> are<01:23:27.280> you<01:23:27.360> still<01:23:27.640> searching<01:23:28.080> over" + }, + { + "start": 5008.27, + "duration": 0.0, + "text": "extent are you still searching over" + }, + { + "start": 5008.28, + "duration": 0.0, + "text": "extent are you still searching over hyper-parameters<01:23:29.360> versus<01:23:30.440> exploiting<01:23:31.000> these" + }, + { + "start": 5011.23, + "duration": 0.0, + "text": "hyper-parameters versus exploiting these" + }, + { + "start": 5011.24, + "duration": 0.0, + "text": "hyper-parameters versus exploiting these rules<01:23:31.440> of<01:23:31.520> thumb<01:23:31.720> that<01:23:31.800> you" + }, + { + "start": 5013.83, + "duration": 0.0, + "text": "rules of thumb that you" + }, + { + "start": 5013.84, + "duration": 0.0, + "text": "rules of thumb that you I<01:23:33.920> think<01:23:34.120> it's<01:23:34.240> a<01:23:34.320> mix<01:23:34.600> of<01:23:34.760> both.<01:23:35.160> I<01:23:35.240> 
think" + }, + { + "start": 5015.43, + "duration": 0.0, + "text": "I think it's a mix of both. I think" + }, + { + "start": 5015.44, + "duration": 0.0, + "text": "I think it's a mix of both. I think every<01:23:35.920> sort<01:23:36.120> of<01:23:36.200> model<01:23:36.520> training<01:23:36.920> run<01:23:37.200> has" + }, + { + "start": 5017.31, + "duration": 0.0, + "text": "every sort of model training run has" + }, + { + "start": 5017.32, + "duration": 0.0, + "text": "every sort of model training run has some<01:23:37.520> theses<01:23:37.960> about<01:23:38.240> what<01:23:38.360> can<01:23:38.520> be<01:23:38.640> varied." + }, + { + "start": 5020.07, + "duration": 0.0, + "text": "some theses about what can be varied." + }, + { + "start": 5020.08, + "duration": 0.0, + "text": "some theses about what can be varied. And<01:23:40.200> so,<01:23:40.320> you<01:23:40.440> see<01:23:40.600> this<01:23:40.800> in<01:23:40.920> a<01:23:40.960> lot<01:23:41.160> of<01:23:41.240> the" + }, + { + "start": 5021.31, + "duration": 0.0, + "text": "And so, you see this in a lot of the" + }, + { + "start": 5021.32, + "duration": 0.0, + "text": "And so, you see this in a lot of the reports<01:23:42.040> where<01:23:42.360> I<01:23:42.400> think<01:23:42.680> the" + }, + { + "start": 5022.79, + "duration": 0.0, + "text": "reports where I think the" + }, + { + "start": 5022.8, + "duration": 0.0, + "text": "reports where I think the hyper-parameters<01:23:43.560> are<01:23:43.680> often<01:23:43.960> not<01:23:44.160> where" + }, + { + "start": 5024.27, + "duration": 0.0, + "text": "hyper-parameters are often not where" + }, + { + "start": 5024.28, + "duration": 0.0, + "text": "hyper-parameters are often not where people<01:23:44.560> are<01:23:44.640> touching<01:23:45.000> too<01:23:45.160> much." + }, + { + "start": 5026.31, + "duration": 0.0, + "text": "people are touching too much." + }, + { + "start": 5026.32, + "duration": 0.0, + "text": "people are touching too much. 
But<01:23:46.520> you<01:23:46.600> see<01:23:46.760> like<01:23:46.960> architecture<01:23:47.400> changes" + }, + { + "start": 5027.91, + "duration": 0.0, + "text": "But you see like architecture changes" + }, + { + "start": 5027.92, + "duration": 0.0, + "text": "But you see like architecture changes like<01:23:48.080> one<01:23:48.240> at<01:23:48.360> a<01:23:48.400> time<01:23:49.200> in<01:23:49.320> a<01:23:49.360> lot<01:23:49.560> of<01:23:49.640> these" + }, + { + "start": 5029.79, + "duration": 0.0, + "text": "like one at a time in a lot of these" + }, + { + "start": 5029.8, + "duration": 0.0, + "text": "like one at a time in a lot of these reports.<01:23:50.240> But<01:23:50.360> it's<01:23:50.480> very<01:23:50.800> rare<01:23:51.120> to<01:23:51.240> like<01:23:51.840> go" + }, + { + "start": 5031.99, + "duration": 0.0, + "text": "reports. But it's very rare to like go" + }, + { + "start": 5032.0, + "duration": 0.0, + "text": "reports. But it's very rare to like go and<01:23:52.120> change<01:23:52.400> everything<01:23:52.760> up.<01:23:53.200> I<01:23:53.240> think<01:23:53.400> Google" + }, + { + "start": 5033.63, + "duration": 0.0, + "text": "and change everything up. I think Google" + }, + { + "start": 5033.64, + "duration": 0.0, + "text": "and change everything up. 
I think Google is<01:23:53.720> one<01:23:53.880> of<01:23:53.960> the<01:23:54.040> only<01:23:54.920> orgs<01:23:55.400> that<01:23:55.560> seems<01:23:55.840> to" + }, + { + "start": 5035.91, + "duration": 0.0, + "text": "is one of the only orgs that seems to" + }, + { + "start": 5035.92, + "duration": 0.0, + "text": "is one of the only orgs that seems to like<01:23:56.120> really<01:23:56.360> spice<01:23:56.680> things<01:23:56.920> up<01:23:57.040> in<01:23:57.120> a" + }, + { + "start": 5037.15, + "duration": 0.0, + "text": "like really spice things up in a" + }, + { + "start": 5037.16, + "duration": 0.0, + "text": "like really spice things up in a significant<01:23:57.720> way." + }, + { + "start": 5038.67, + "duration": 0.0, + "text": "significant way." + }, + { + "start": 5038.68, + "duration": 0.0, + "text": "significant way. The<01:23:58.800> Gemma<01:23:59.080> series<01:23:59.840> has<01:24:00.000> done<01:24:00.120> some<01:24:00.240> pretty" + }, + { + "start": 5040.59, + "duration": 0.0, + "text": "The Gemma series has done some pretty" + }, + { + "start": 5040.6, + "duration": 0.0, + "text": "The Gemma series has done some pretty interesting<01:24:01.080> things." + }, + { + "start": 5042.63, + "duration": 0.0, + "text": "interesting things." + }, + { + "start": 5042.64, + "duration": 0.0, + "text": "interesting things. 
The<01:24:02.760> most<01:24:03.000> recent<01:24:03.280> Gemma<01:24:03.520> 4<01:24:03.680> release,<01:24:04.000> for" + }, + { + "start": 5044.07, + "duration": 0.0, + "text": "The most recent Gemma 4 release, for" + }, + { + "start": 5044.08, + "duration": 0.0, + "text": "The most recent Gemma 4 release, for example,<01:24:04.440> now<01:24:04.640> has<01:24:04.840> like<01:24:05.480> individual" + }, + { + "start": 5046.31, + "duration": 0.0, + "text": "example, now has like individual" + }, + { + "start": 5046.32, + "duration": 0.0, + "text": "example, now has like individual embedding<01:24:06.680> for<01:24:06.800> every<01:24:07.040> layer" + }, + { + "start": 5048.07, + "duration": 0.0, + "text": "embedding for every layer" + }, + { + "start": 5048.08, + "duration": 0.0, + "text": "embedding for every layer in<01:24:08.200> a<01:24:08.280> way<01:24:08.440> to<01:24:08.600> control<01:24:09.040> the<01:24:09.160> trade-offs" + }, + { + "start": 5049.59, + "duration": 0.0, + "text": "in a way to control the trade-offs" + }, + { + "start": 5049.6, + "duration": 0.0, + "text": "in a way to control the trade-offs between<01:24:09.880> like<01:24:10.040> memory<01:24:10.440> use<01:24:10.680> and<01:24:10.800> flops." + }, + { + "start": 5051.11, + "duration": 0.0, + "text": "between like memory use and flops." + }, + { + "start": 5051.12, + "duration": 0.0, + "text": "between like memory use and flops. They're<01:24:11.240> very<01:24:11.480> interesting<01:24:11.880> set<01:24:12.000> of<01:24:12.080> things" + }, + { + "start": 5052.31, + "duration": 0.0, + "text": "They're very interesting set of things" + }, + { + "start": 5052.32, + "duration": 0.0, + "text": "They're very interesting set of things that<01:24:12.440> they've<01:24:12.600> done." + }, + { + "start": 5054.79, + "duration": 0.0, + "text": "that they've done." + }, + { + "start": 5054.8, + "duration": 0.0, + "text": "that they've done. 
Oh,<01:24:14.880> yeah,<01:24:15.080> back<01:24:15.280> there." + }, + { + "start": 5056.67, + "duration": 0.0, + "text": "Oh, yeah, back there." + }, + { + "start": 5056.68, + "duration": 0.0, + "text": "Oh, yeah, back there. Do<01:24:16.800> you<01:24:16.840> experiment<01:24:17.480> with<01:24:17.640> like<01:24:17.840> data<01:24:18.440> all" + }, + { + "start": 5058.63, + "duration": 0.0, + "text": "Do you experiment with like data all" + }, + { + "start": 5058.64, + "duration": 0.0, + "text": "Do you experiment with like data all things<01:24:18.960> of<01:24:19.120> these<01:24:20.080> parameters" + }, + { + "start": 5061.23, + "duration": 0.0, + "text": "things of these parameters" + }, + { + "start": 5061.24, + "duration": 0.0, + "text": "things of these parameters during<01:24:21.520> training?" + }, + { + "start": 5062.43, + "duration": 0.0, + "text": "during training?" + }, + { + "start": 5062.44, + "duration": 0.0, + "text": "during training? During<01:24:22.720> training," + }, + { + "start": 5063.63, + "duration": 0.0, + "text": "During training," + }, + { + "start": 5063.64, + "duration": 0.0, + "text": "During training, let<01:24:23.760> me<01:24:23.840> think." + }, + { + "start": 5066.4, + "duration": 0.0, + "text": "Weight<01:24:26.600> decay,<01:24:27.280> yes.<01:24:27.840> Weight<01:24:28.000> decay,<01:24:28.400> people" + }, + { + "start": 5068.75, + "duration": 0.0, + "text": "Weight decay, yes. Weight decay, people" + }, + { + "start": 5068.76, + "duration": 0.0, + "text": "Weight decay, yes. Weight decay, people change<01:24:29.520> in<01:24:29.720> concert<01:24:30.280> with<01:24:30.440> like<01:24:30.680> learning" + }, + { + "start": 5071.07, + "duration": 0.0, + "text": "change in concert with like learning" + }, + { + "start": 5071.08, + "duration": 0.0, + "text": "change in concert with like learning rate." + }, + { + "start": 5072.11, + "duration": 0.0, + "text": "rate." 
+ }, + { + "start": 5072.12, + "duration": 0.0, + "text": "rate. That<01:24:32.240> is<01:24:32.360> actually<01:24:32.640> a<01:24:32.680> heuristic<01:24:33.200> that<01:24:33.360> people" + }, + { + "start": 5073.63, + "duration": 0.0, + "text": "That is actually a heuristic that people" + }, + { + "start": 5073.64, + "duration": 0.0, + "text": "That is actually a heuristic that people do<01:24:34.240> that<01:24:34.360> works<01:24:34.560> very<01:24:34.760> well." + }, + { + "start": 5075.75, + "duration": 0.0, + "text": "do that works very well." + }, + { + "start": 5075.76, + "duration": 0.0, + "text": "do that works very well. Um<01:24:36.720> other<01:24:37.040> than<01:24:37.480> that,<01:24:38.040> I<01:24:38.080> don't<01:24:38.360> know<01:24:38.640> if" + }, + { + "start": 5078.83, + "duration": 0.0, + "text": "Um other than that, I don't know if" + }, + { + "start": 5078.84, + "duration": 0.0, + "text": "Um other than that, I don't know if there's<01:24:39.080> a<01:24:39.160> lot<01:24:39.640> of<01:24:39.760> different<01:24:40.200> hypers<01:24:40.720> that" + }, + { + "start": 5080.87, + "duration": 0.0, + "text": "there's a lot of different hypers that" + }, + { + "start": 5080.88, + "duration": 0.0, + "text": "there's a lot of different hypers that people<01:24:41.160> change<01:24:41.840> during<01:24:42.120> training," + }, + { + "start": 5082.51, + "duration": 0.0, + "text": "people change during training," + }, + { + "start": 5082.52, + "duration": 0.0, + "text": "people change during training, especially<01:24:42.880> because<01:24:43.040> the<01:24:43.160> architecture<01:24:43.640> ones" + }, + { + "start": 5084.23, + "duration": 0.0, + "text": "especially because the architecture ones" + }, + { + "start": 5084.24, + "duration": 0.0, + "text": "especially because the architecture ones just<01:24:44.400> make<01:24:44.600> training<01:24:44.880> incompatible.<01:24:45.880> So,<01:24:45.960> you" + }, + { + "start": 5086.03, + "duration": 
0.0, + "text": "just make training incompatible. So, you" + }, + { + "start": 5086.04, + "duration": 0.0, + "text": "just make training incompatible. So, you can't<01:24:46.240> really,<01:24:46.520> you<01:24:46.600> know,<01:24:46.680> change<01:24:47.000> them" + }, + { + "start": 5087.11, + "duration": 0.0, + "text": "can't really, you know, change them" + }, + { + "start": 5087.12, + "duration": 0.0, + "text": "can't really, you know, change them while<01:24:47.280> you're<01:24:47.480> you're<01:24:47.600> training." + }, + { + "start": 5089.47, + "duration": 0.0, + "text": "while you're you're training." + }, + { + "start": 5089.48, + "duration": 0.0, + "text": "while you're you're training. Yeah,<01:24:49.800> so<01:24:49.960> I<01:24:50.000> think<01:24:50.120> I<01:24:50.160> think<01:24:50.400> weight<01:24:50.600> decay<01:24:50.840> is" + }, + { + "start": 5090.95, + "duration": 0.0, + "text": "Yeah, so I think I think weight decay is" + }, + { + "start": 5090.96, + "duration": 0.0, + "text": "Yeah, so I think I think weight decay is probably<01:24:51.120> the<01:24:51.240> one<01:24:51.440> that<01:24:51.560> I<01:24:51.600> can<01:24:51.760> think<01:24:51.960> of." + }, + { + "start": 5092.07, + "duration": 0.0, + "text": "probably the one that I can think of." + }, + { + "start": 5092.08, + "duration": 0.0, + "text": "probably the one that I can think of. The<01:24:52.200> others<01:24:52.520> are<01:24:52.640> usually<01:24:53.000> fixed." + }, + { + "start": 5094.79, + "duration": 0.0, + "text": "The others are usually fixed." + }, + { + "start": 5094.8, + "duration": 0.0, + "text": "The others are usually fixed. Yeah." + }, + { + "start": 5096.27, + "duration": 0.0, + "text": "Yeah." + }, + { + "start": 5096.28, + "duration": 0.0, + "text": "Yeah. 
Yeah,<01:24:56.360> MQA<01:24:56.760> is<01:24:57.160> uh" + }, + { + "start": 5098.39, + "duration": 0.0, + "text": "Yeah, MQA is uh" + }, + { + "start": 5098.4, + "duration": 0.0, + "text": "Yeah, MQA is uh uh" + }, + { + "start": 5099.19, + "duration": 0.0, + "text": "uh" + }, + { + "start": 5099.2, + "duration": 0.0, + "text": "uh it's<01:24:59.320> not<01:24:59.480> just<01:24:59.720> inference<01:25:00.160> time<01:25:00.480> fixed.<01:25:01.160> It's" + }, + { + "start": 5101.27, + "duration": 0.0, + "text": "it's not just inference time fixed. It's" + }, + { + "start": 5101.28, + "duration": 0.0, + "text": "it's not just inference time fixed. It's a<01:25:01.360> pleasure<01:25:01.840> to<01:25:01.920> train.<01:25:02.400> That's<01:25:02.560> right,<01:25:02.720> yeah." + }, + { + "start": 5103.03, + "duration": 0.0, + "text": "a pleasure to train. That's right, yeah." + }, + { + "start": 5103.04, + "duration": 0.0, + "text": "a pleasure to train. That's right, yeah. You<01:25:03.160> you<01:25:03.280> you<01:25:03.480> train<01:25:03.960> with<01:25:04.120> a<01:25:04.200> certain<01:25:04.440> number" + }, + { + "start": 5104.71, + "duration": 0.0, + "text": "You you you train with a certain number" + }, + { + "start": 5104.72, + "duration": 0.0, + "text": "You you you train with a certain number of<01:25:04.840> keys." + }, + { + "start": 5108.36, + "duration": 0.0, + "text": "Okay.<01:25:08.880> The<01:25:09.000> last<01:25:09.280> thing<01:25:09.440> I'll<01:25:09.560> talk<01:25:09.800> about<01:25:10.400> is" + }, + { + "start": 5110.99, + "duration": 0.0, + "text": "Okay. The last thing I'll talk about is" + }, + { + "start": 5111.0, + "duration": 0.0, + "text": "Okay. 
The last thing I'll talk about is sliding<01:25:11.520> window<01:25:11.760> attention,<01:25:12.240> which<01:25:12.400> is<01:25:12.480> a" + }, + { + "start": 5112.55, + "duration": 0.0, + "text": "sliding window attention, which is a" + }, + { + "start": 5112.56, + "duration": 0.0, + "text": "sliding window attention, which is a really<01:25:13.040> old<01:25:13.360> idea.<01:25:13.880> Like,<01:25:14.000> you<01:25:14.080> know,<01:25:14.200> GPT-3" + }, + { + "start": 5115.55, + "duration": 0.0, + "text": "really old idea. Like, you know, GPT-3" + }, + { + "start": 5115.56, + "duration": 0.0, + "text": "really old idea. Like, you know, GPT-3 used<01:25:16.000> actually<01:25:16.440> this<01:25:17.200> you<01:25:17.320> know,<01:25:17.400> if<01:25:17.480> you<01:25:17.600> read" + }, + { + "start": 5117.71, + "duration": 0.0, + "text": "used actually this you know, if you read" + }, + { + "start": 5117.72, + "duration": 0.0, + "text": "used actually this you know, if you read the<01:25:17.800> paper,<01:25:18.120> they'll<01:25:18.280> say<01:25:18.400> we<01:25:18.600> alternate" + }, + { + "start": 5119.27, + "duration": 0.0, + "text": "the paper, they'll say we alternate" + }, + { + "start": 5119.28, + "duration": 0.0, + "text": "the paper, they'll say we alternate between<01:25:20.160> full<01:25:20.440> attention,<01:25:20.960> which<01:25:21.440> where" + }, + { + "start": 5121.63, + "duration": 0.0, + "text": "between full attention, which where" + }, + { + "start": 5121.64, + "duration": 0.0, + "text": "between full attention, which where every<01:25:21.880> position<01:25:22.280> can<01:25:22.400> attend<01:25:22.600> to<01:25:22.680> everyone<01:25:22.960> in" + }, + { + "start": 5123.03, + "duration": 0.0, + "text": "every position can attend to everyone in" + }, + { + "start": 5123.04, + "duration": 0.0, + "text": "every position can attend to everyone in the<01:25:23.120> past,<01:25:23.880> and<01:25:24.000> a<01:25:24.080> banded<01:25:24.560> 
matrix<01:25:25.600> style" + }, + { + "start": 5125.87, + "duration": 0.0, + "text": "the past, and a banded matrix style" + }, + { + "start": 5125.88, + "duration": 0.0, + "text": "the past, and a banded matrix style attention<01:25:26.320> where<01:25:26.440> you<01:25:26.520> can<01:25:26.960> attend<01:25:27.280> to" + }, + { + "start": 5127.35, + "duration": 0.0, + "text": "attention where you can attend to" + }, + { + "start": 5127.36, + "duration": 0.0, + "text": "attention where you can attend to everyone<01:25:28.080> within<01:25:28.280> a<01:25:28.320> fixed<01:25:28.640> window." + }, + { + "start": 5130.27, + "duration": 0.0, + "text": "everyone within a fixed window." + }, + { + "start": 5130.28, + "duration": 0.0, + "text": "everyone within a fixed window. And<01:25:30.640> you<01:25:30.720> know,<01:25:30.880> OpenAI<01:25:31.280> has<01:25:31.360> some<01:25:31.520> early<01:25:31.840> work" + }, + { + "start": 5132.27, + "duration": 0.0, + "text": "And you know, OpenAI has some early work" + }, + { + "start": 5132.28, + "duration": 0.0, + "text": "And you know, OpenAI has some early work on<01:25:32.480> these<01:25:32.680> kinds<01:25:33.040> of<01:25:33.120> like<01:25:33.360> different<01:25:33.840> kinds" + }, + { + "start": 5134.07, + "duration": 0.0, + "text": "on these kinds of like different kinds" + }, + { + "start": 5134.08, + "duration": 0.0, + "text": "on these kinds of like different kinds of<01:25:34.160> attention<01:25:34.560> patterns<01:25:35.480> that<01:25:35.600> you<01:25:35.680> can<01:25:35.880> use." + }, + { + "start": 5136.67, + "duration": 0.0, + "text": "of attention patterns that you can use." + }, + { + "start": 5136.68, + "duration": 0.0, + "text": "of attention patterns that you can use. 
But<01:25:37.280> actually,<01:25:37.520> this<01:25:37.640> has<01:25:37.760> become<01:25:38.000> really" + }, + { + "start": 5138.27, + "duration": 0.0, + "text": "But actually, this has become really" + }, + { + "start": 5138.28, + "duration": 0.0, + "text": "But actually, this has become really really<01:25:38.560> popular<01:25:39.400> over<01:25:39.680> the<01:25:39.800> past<01:25:40.160> year." + }, + { + "start": 5141.55, + "duration": 0.0, + "text": "really popular over the past year." + }, + { + "start": 5141.56, + "duration": 0.0, + "text": "really popular over the past year. This<01:25:41.720> idea<01:25:42.320> of<01:25:42.840> alternating,<01:25:43.960> you<01:25:44.040> know,<01:25:44.200> the" + }, + { + "start": 5144.39, + "duration": 0.0, + "text": "This idea of alternating, you know, the" + }, + { + "start": 5144.4, + "duration": 0.0, + "text": "This idea of alternating, you know, the big<01:25:44.680> full<01:25:44.840> attention<01:25:45.760> and<01:25:46.240> a<01:25:46.440> more<01:25:46.720> local" + }, + { + "start": 5147.15, + "duration": 0.0, + "text": "big full attention and a more local" + }, + { + "start": 5147.16, + "duration": 0.0, + "text": "big full attention and a more local attention<01:25:47.680> actually<01:25:48.080> hits<01:25:48.280> a<01:25:48.360> sweet<01:25:48.720> spot<01:25:49.600> for" + }, + { + "start": 5150.03, + "duration": 0.0, + "text": "attention actually hits a sweet spot for" + }, + { + "start": 5150.04, + "duration": 0.0, + "text": "attention actually hits a sweet spot for how<01:25:50.160> to<01:25:50.600> manage<01:25:51.120> like<01:25:51.320> long<01:25:51.520> context" + }, + { + "start": 5151.91, + "duration": 0.0, + "text": "how to manage like long context" + }, + { + "start": 5151.92, + "duration": 0.0, + "text": "how to manage like long context performance<01:25:52.480> while<01:25:52.640> not<01:25:52.840> paying<01:25:53.720> too<01:25:53.880> much" + }, + { + "start": 5154.47, + "duration": 
0.0, + "text": "performance while not paying too much" + }, + { + "start": 5154.48, + "duration": 0.0, + "text": "performance while not paying too much for<01:25:54.640> inference." + }, + { + "start": 5156.31, + "duration": 0.0, + "text": "for inference." + }, + { + "start": 5156.32, + "duration": 0.0, + "text": "for inference. Um<01:25:57.120> I<01:25:57.240> think,<01:25:57.800> you<01:25:57.880> know,<01:25:58.640> the<01:25:58.800> more<01:25:59.080> recent" + }, + { + "start": 5159.59, + "duration": 0.0, + "text": "Um I think, you know, the more recent" + }, + { + "start": 5159.6, + "duration": 0.0, + "text": "Um I think, you know, the more recent revival<01:26:00.200> in<01:26:00.320> open<01:26:00.560> models,<01:26:01.000> I<01:26:01.080> would<01:26:01.240> maybe" + }, + { + "start": 5161.55, + "duration": 0.0, + "text": "revival in open models, I would maybe" + }, + { + "start": 5161.56, + "duration": 0.0, + "text": "revival in open models, I would maybe say<01:26:02.440> Cohere<01:26:02.800> Command<01:26:03.240> A<01:26:03.400> was<01:26:03.560> the<01:26:03.640> first<01:26:03.920> one<01:26:04.040> I" + }, + { + "start": 5164.11, + "duration": 0.0, + "text": "say Cohere Command A was the first one I" + }, + { + "start": 5164.12, + "duration": 0.0, + "text": "say Cohere Command A was the first one I saw<01:26:04.360> do<01:26:04.600> it,<01:26:05.160> where<01:26:05.840> they,<01:26:06.560> you<01:26:06.640> know,<01:26:06.760> had" + }, + { + "start": 5166.99, + "duration": 0.0, + "text": "saw do it, where they, you know, had" + }, + { + "start": 5167.0, + "duration": 0.0, + "text": "saw do it, where they, you know, had this<01:26:07.160> like<01:26:07.360> structure<01:26:07.920> where<01:26:08.560> every<01:26:09.000> four" + }, + { + "start": 5169.35, + "duration": 0.0, + "text": "this like structure where every four" + }, + { + "start": 5169.36, + "duration": 0.0, + "text": "this like structure where every four layers<01:26:09.800> 
they<01:26:09.920> would<01:26:10.000> have<01:26:10.120> a<01:26:10.200> full<01:26:10.400> attention" + }, + { + "start": 5170.79, + "duration": 0.0, + "text": "layers they would have a full attention" + }, + { + "start": 5170.8, + "duration": 0.0, + "text": "layers they would have a full attention that<01:26:10.920> attended<01:26:11.280> to<01:26:11.360> everything.<01:26:12.440> The<01:26:12.560> three" + }, + { + "start": 5172.83, + "duration": 0.0, + "text": "that attended to everything. The three" + }, + { + "start": 5172.84, + "duration": 0.0, + "text": "that attended to everything. The three layers<01:26:13.120> in<01:26:13.200> between<01:26:13.680> would<01:26:13.800> use<01:26:13.920> a<01:26:14.000> sliding" + }, + { + "start": 5174.43, + "duration": 0.0, + "text": "layers in between would use a sliding" + }, + { + "start": 5174.44, + "duration": 0.0, + "text": "layers in between would use a sliding window<01:26:14.720> attention<01:26:15.560> that<01:26:15.720> would<01:26:15.840> only<01:26:16.080> be<01:26:16.200> able" + }, + { + "start": 5176.35, + "duration": 0.0, + "text": "window attention that would only be able" + }, + { + "start": 5176.36, + "duration": 0.0, + "text": "window attention that would only be able to<01:26:16.440> look<01:26:16.640> at<01:26:16.760> local<01:26:17.080> structure.<01:26:17.520> And<01:26:17.640> of" + }, + { + "start": 5177.71, + "duration": 0.0, + "text": "to look at local structure. And of" + }, + { + "start": 5177.72, + "duration": 0.0, + "text": "to look at local structure. 
And of course,<01:26:18.280> you<01:26:18.400> know,<01:26:18.520> as<01:26:18.640> you<01:26:18.760> go<01:26:19.080> up<01:26:20.080> sorry,<01:26:20.360> in" + }, + { + "start": 5180.47, + "duration": 0.0, + "text": "course, you know, as you go up sorry, in" + }, + { + "start": 5180.48, + "duration": 0.0, + "text": "course, you know, as you go up sorry, in this<01:26:20.600> case<01:26:20.840> down<01:26:21.200> cuz<01:26:21.320> they<01:26:21.440> ordered<01:26:21.760> the<01:26:22.000> the" + }, + { + "start": 5182.71, + "duration": 0.0, + "text": "this case down cuz they ordered the the" + }, + { + "start": 5182.72, + "duration": 0.0, + "text": "this case down cuz they ordered the the diagram<01:26:23.120> the<01:26:23.200> other<01:26:23.320> way.<01:26:23.720> As<01:26:23.840> you<01:26:23.880> go<01:26:24.040> down" + }, + { + "start": 5184.55, + "duration": 0.0, + "text": "diagram the other way. As you go down" + }, + { + "start": 5184.56, + "duration": 0.0, + "text": "diagram the other way. As you go down sort<01:26:24.720> of<01:26:24.840> these<01:26:25.080> blocks,<01:26:26.000> you<01:26:26.080> know,<01:26:26.200> you're" + }, + { + "start": 5186.39, + "duration": 0.0, + "text": "sort of these blocks, you know, you're" + }, + { + "start": 5186.4, + "duration": 0.0, + "text": "sort of these blocks, you know, you're aggregating<01:26:26.920> local<01:26:27.200> information<01:26:27.640> into" + }, + { + "start": 5187.79, + "duration": 0.0, + "text": "aggregating local information into" + }, + { + "start": 5187.8, + "duration": 0.0, + "text": "aggregating local information into global<01:26:28.200> ones.<01:26:28.440> The<01:26:28.560> local<01:26:28.960> attentions<01:26:29.440> at<01:26:29.560> the" + }, + { + "start": 5189.67, + "duration": 0.0, + "text": "global ones. The local attentions at the" + }, + { + "start": 5189.68, + "duration": 0.0, + "text": "global ones. 
The local attentions at the end<01:26:29.880> can<01:26:30.000> of<01:26:30.120> course<01:26:30.440> access<01:26:30.960> more<01:26:31.120> global" + }, + { + "start": 5191.43, + "duration": 0.0, + "text": "end can of course access more global" + }, + { + "start": 5191.44, + "duration": 0.0, + "text": "end can of course access more global information,<01:26:32.440> but<01:26:32.560> this,<01:26:32.760> you<01:26:32.840> know,<01:26:32.960> allows" + }, + { + "start": 5193.35, + "duration": 0.0, + "text": "information, but this, you know, allows" + }, + { + "start": 5193.36, + "duration": 0.0, + "text": "information, but this, you know, allows you<01:26:33.440> to<01:26:33.560> manage<01:26:34.520> the" + }, + { + "start": 5195.75, + "duration": 0.0, + "text": "you to manage the" + }, + { + "start": 5195.76, + "duration": 0.0, + "text": "you to manage the the<01:26:36.400> the<01:26:36.520> cost<01:26:37.520> of<01:26:37.680> having<01:26:37.920> a<01:26:38.000> really<01:26:38.320> long" + }, + { + "start": 5198.55, + "duration": 0.0, + "text": "the the cost of having a really long" + }, + { + "start": 5198.56, + "duration": 0.0, + "text": "the the cost of having a really long context<01:26:39.360> without<01:26:39.720> having<01:26:40.000> to<01:26:40.120> go<01:26:40.360> for" + }, + { + "start": 5200.55, + "duration": 0.0, + "text": "context without having to go for" + }, + { + "start": 5200.56, + "duration": 0.0, + "text": "context without having to go for something<01:26:40.920> like<01:26:41.800> a<01:26:41.880> state-space<01:26:42.400> model<01:26:42.720> or" + }, + { + "start": 5202.87, + "duration": 0.0, + "text": "something like a state-space model or" + }, + { + "start": 5202.88, + "duration": 0.0, + "text": "something like a state-space model or more<01:26:43.040> exotic<01:26:43.560> intervention.<01:26:44.080> And<01:26:44.200> that's" + }, + { + "start": 5204.39, + "duration": 0.0, + "text": "more exotic intervention. 
And that's" + }, + { + "start": 5204.4, + "duration": 0.0, + "text": "more exotic intervention. And that's worked<01:26:44.640> quite<01:26:44.840> well." + }, + { + "start": 5205.99, + "duration": 0.0, + "text": "worked quite well." + }, + { + "start": 5206.0, + "duration": 0.0, + "text": "worked quite well. Um<01:26:46.600> there's<01:26:46.840> also<01:26:47.120> some<01:26:47.280> innovation<01:26:47.880> where" + }, + { + "start": 5209.07, + "duration": 0.0, + "text": "Um there's also some innovation where" + }, + { + "start": 5209.08, + "duration": 0.0, + "text": "Um there's also some innovation where people<01:26:49.400> change<01:26:49.800> the<01:26:50.200> embedding<01:26:50.640> format<01:26:51.320> for" + }, + { + "start": 5211.67, + "duration": 0.0, + "text": "people change the embedding format for" + }, + { + "start": 5211.68, + "duration": 0.0, + "text": "people change the embedding format for the<01:26:51.800> long<01:26:52.120> range<01:26:52.360> information<01:26:52.880> where<01:26:53.000> they" + }, + { + "start": 5213.11, + "duration": 0.0, + "text": "the long range information where they" + }, + { + "start": 5213.12, + "duration": 0.0, + "text": "the long range information where they get<01:26:53.360> rid<01:26:53.480> of<01:26:53.560> things<01:26:53.840> like<01:26:54.520> rope,<01:26:54.880> so<01:26:55.040> you<01:26:55.160> have" + }, + { + "start": 5215.35, + "duration": 0.0, + "text": "get rid of things like rope, so you have" + }, + { + "start": 5215.36, + "duration": 0.0, + "text": "get rid of things like rope, so you have no<01:26:55.600> position<01:26:56.000> embeddings<01:26:56.360> at<01:26:56.480> all.<01:26:57.000> So," + }, + { + "start": 5217.11, + "duration": 0.0, + "text": "no position embeddings at all. So," + }, + { + "start": 5217.12, + "duration": 0.0, + "text": "no position embeddings at all. 
So, you're<01:26:57.240> really<01:26:57.440> looking<01:26:57.720> almost<01:26:58.040> at<01:26:58.200> bags" + }, + { + "start": 5219.55, + "duration": 0.0, + "text": "you're really looking almost at bags" + }, + { + "start": 5219.56, + "duration": 0.0, + "text": "you're really looking almost at bags where<01:26:59.720> the<01:26:59.840> short<01:27:00.120> range<01:27:00.320> information<01:27:00.840> still" + }, + { + "start": 5221.23, + "duration": 0.0, + "text": "where the short range information still" + }, + { + "start": 5221.24, + "duration": 0.0, + "text": "where the short range information still gets<01:27:01.560> position<01:27:01.920> information.<01:27:02.720> So,<01:27:02.840> people<01:27:03.160> do" + }, + { + "start": 5223.51, + "duration": 0.0, + "text": "gets position information. So, people do" + }, + { + "start": 5223.52, + "duration": 0.0, + "text": "gets position information. So, people do all<01:27:03.680> sorts<01:27:03.920> of<01:27:04.680> you<01:27:04.760> know," + }, + { + "start": 5225.55, + "duration": 0.0, + "text": "all sorts of you know," + }, + { + "start": 5225.56, + "duration": 0.0, + "text": "all sorts of you know, kinds<01:27:05.920> of<01:27:06.000> interventions<01:27:06.680> involving<01:27:07.120> these" + }, + { + "start": 5227.59, + "duration": 0.0, + "text": "kinds of interventions involving these" + }, + { + "start": 5227.6, + "duration": 0.0, + "text": "kinds of interventions involving these these" + }, + { + "start": 5228.39, + "duration": 0.0, + "text": "these" + }, + { + "start": 5228.4, + "duration": 0.0, + "text": "these both<01:27:08.640> the<01:27:08.720> embeddings<01:27:09.600> and<01:27:09.920> alternating" + }, + { + "start": 5230.55, + "duration": 0.0, + "text": "both the embeddings and alternating" + }, + { + "start": 5230.56, + "duration": 0.0, + "text": "both the embeddings and alternating local<01:27:10.920> and<01:27:11.040> global<01:27:11.440> structure." 
+ }, + { + "start": 5232.83, + "duration": 0.0, + "text": "local and global structure." + }, + { + "start": 5232.84, + "duration": 0.0, + "text": "local and global structure. Um<01:27:13.040> I'll<01:27:13.200> say<01:27:13.360> that<01:27:13.520> this<01:27:13.720> is<01:27:13.880> a" + }, + { + "start": 5234.79, + "duration": 0.0, + "text": "Um I'll say that this is a" + }, + { + "start": 5234.8, + "duration": 0.0, + "text": "Um I'll say that this is a you<01:27:14.880> know,<01:27:15.000> attention<01:27:15.480> and<01:27:15.600> in<01:27:15.680> general<01:27:16.200> how" + }, + { + "start": 5236.35, + "duration": 0.0, + "text": "you know, attention and in general how" + }, + { + "start": 5236.36, + "duration": 0.0, + "text": "you know, attention and in general how to<01:27:16.480> manage<01:27:16.880> the<01:27:17.000> trade-off<01:27:17.360> between<01:27:18.320> long" + }, + { + "start": 5238.59, + "duration": 0.0, + "text": "to manage the trade-off between long" + }, + { + "start": 5238.6, + "duration": 0.0, + "text": "to manage the trade-off between long context<01:27:19.240> and<01:27:19.400> sort<01:27:19.520> of<01:27:20.200> long<01:27:20.440> context<01:27:20.840> cost" + }, + { + "start": 5241.23, + "duration": 0.0, + "text": "context and sort of long context cost" + }, + { + "start": 5241.24, + "duration": 0.0, + "text": "context and sort of long context cost and<01:27:21.400> performance<01:27:22.240> is<01:27:22.440> a<01:27:22.720> still<01:27:22.920> an<01:27:23.040> active" + }, + { + "start": 5243.39, + "duration": 0.0, + "text": "and performance is a still an active" + }, + { + "start": 5243.4, + "duration": 0.0, + "text": "and performance is a still an active area<01:27:23.640> of<01:27:23.720> investigation.<01:27:24.400> It's<01:27:24.520> a<01:27:24.600> place" + }, + { + "start": 5244.87, + "duration": 0.0, + "text": "area of investigation. It's a place" + }, + { + "start": 5244.88, + "duration": 0.0, + "text": "area of investigation. 
It's a place where<01:27:25.280> the<01:27:25.400> most<01:27:25.800> architecture<01:27:26.520> work<01:27:27.080> and" + }, + { + "start": 5247.19, + "duration": 0.0, + "text": "where the most architecture work and" + }, + { + "start": 5247.2, + "duration": 0.0, + "text": "where the most architecture work and changes<01:27:27.640> are<01:27:27.720> still<01:27:27.960> being<01:27:28.160> done.<01:27:29.000> Um<01:27:29.360> we<01:27:29.520> see" + }, + { + "start": 5250.23, + "duration": 0.0, + "text": "changes are still being done. Um we see" + }, + { + "start": 5250.24, + "duration": 0.0, + "text": "changes are still being done. Um we see essentially<01:27:30.680> a<01:27:30.920> bunch<01:27:31.200> of<01:27:31.280> other<01:27:31.480> models" + }, + { + "start": 5251.79, + "duration": 0.0, + "text": "essentially a bunch of other models" + }, + { + "start": 5251.8, + "duration": 0.0, + "text": "essentially a bunch of other models adopt<01:27:32.160> this<01:27:32.320> idea.<01:27:32.600> Llama<01:27:32.920> 4,<01:27:33.680> most<01:27:33.960> recently" + }, + { + "start": 5254.23, + "duration": 0.0, + "text": "adopt this idea. Llama 4, most recently" + }, + { + "start": 5254.24, + "duration": 0.0, + "text": "adopt this idea. 
Llama 4, most recently Gemma<01:27:34.520> 4,<01:27:34.800> Omo<01:27:35.040> 3,<01:27:35.720> they<01:27:35.920> all<01:27:36.160> do<01:27:36.920> this" + }, + { + "start": 5257.11, + "duration": 0.0, + "text": "Gemma 4, Omo 3, they all do this" + }, + { + "start": 5257.12, + "duration": 0.0, + "text": "Gemma 4, Omo 3, they all do this combination<01:27:37.640> of<01:27:37.760> sliding<01:27:38.160> window<01:27:38.400> attention" + }, + { + "start": 5258.83, + "duration": 0.0, + "text": "combination of sliding window attention" + }, + { + "start": 5258.84, + "duration": 0.0, + "text": "combination of sliding window attention and<01:27:38.960> full<01:27:39.120> attention,<01:27:40.040> in<01:27:40.200> their<01:27:40.400> case<01:27:40.640> using" + }, + { + "start": 5260.91, + "duration": 0.0, + "text": "and full attention, in their case using" + }, + { + "start": 5260.92, + "duration": 0.0, + "text": "and full attention, in their case using full<01:27:41.160> rope<01:27:41.440> instead<01:27:41.760> of<01:27:42.160> nope<01:27:42.880> as<01:27:43.160> the" + }, + { + "start": 5264.03, + "duration": 0.0, + "text": "full rope instead of nope as the" + }, + { + "start": 5264.04, + "duration": 0.0, + "text": "full rope instead of nope as the embedding." + }, + { + "start": 5265.39, + "duration": 0.0, + "text": "embedding." + }, + { + "start": 5265.4, + "duration": 0.0, + "text": "embedding. So,<01:27:45.480> as<01:27:45.600> I<01:27:45.680> said,<01:27:46.520> this<01:27:46.680> is<01:27:46.760> becoming<01:27:47.080> really" + }, + { + "start": 5267.31, + "duration": 0.0, + "text": "So, as I said, this is becoming really" + }, + { + "start": 5267.32, + "duration": 0.0, + "text": "So, as I said, this is becoming really really<01:27:47.560> popular." + }, + { + "start": 5268.91, + "duration": 0.0, + "text": "really popular." + }, + { + "start": 5268.92, + "duration": 0.0, + "text": "really popular. 
Qwen<01:27:49.240> 3.5,<01:27:49.840> which<01:27:49.960> I<01:27:50.040> put<01:27:50.280> on<01:27:50.360> the<01:27:50.480> right," + }, + { + "start": 5271.15, + "duration": 0.0, + "text": "Qwen 3.5, which I put on the right," + }, + { + "start": 5271.16, + "duration": 0.0, + "text": "Qwen 3.5, which I put on the right, they're<01:27:51.400> actually<01:27:51.680> a<01:27:51.720> little<01:27:51.880> bit<01:27:52.040> different" + }, + { + "start": 5272.43, + "duration": 0.0, + "text": "they're actually a little bit different" + }, + { + "start": 5272.44, + "duration": 0.0, + "text": "they're actually a little bit different because<01:27:52.680> they<01:27:52.840> alternate<01:27:53.640> a<01:27:53.719> state-space" + }, + { + "start": 5274.43, + "duration": 0.0, + "text": "because they alternate a state-space" + }, + { + "start": 5274.44, + "duration": 0.0, + "text": "because they alternate a state-space model" + }, + { + "start": 5275.67, + "duration": 0.0, + "text": "model" + }, + { + "start": 5275.68, + "duration": 0.0, + "text": "model called<01:27:56.000> a<01:27:56.120> gated<01:27:56.440> DeltaNet<01:27:57.200> and<01:27:57.600> a<01:27:57.680> full" + }, + { + "start": 5277.91, + "duration": 0.0, + "text": "called a gated DeltaNet and a full" + }, + { + "start": 5277.92, + "duration": 0.0, + "text": "called a gated DeltaNet and a full attention<01:27:58.920> every<01:27:59.600> sort<01:27:59.840> of,<01:28:00.160> you<01:28:00.240> know,<01:28:00.400> one" + }, + { + "start": 5281.51, + "duration": 0.0, + "text": "attention every sort of, you know, one" + }, + { + "start": 5281.52, + "duration": 0.0, + "text": "attention every sort of, you know, one one<01:28:01.719> full<01:28:01.880> attention<01:28:02.160> every<01:28:02.360> four<01:28:02.600> layer" + }, + { + "start": 5282.87, + "duration": 0.0, + "text": "one full attention every four layer" + }, + { + "start": 5282.88, + "duration": 0.0, + "text": "one full attention every four layer 
every<01:28:03.240> four<01:28:03.440> layers." + }, + { + "start": 5284.59, + "duration": 0.0, + "text": "every four layers." + }, + { + "start": 5284.6, + "duration": 0.0, + "text": "every four layers. So,<01:28:04.680> it's<01:28:04.800> the<01:28:04.880> same<01:28:05.200> alternating<01:28:05.760> structure," + }, + { + "start": 5286.43, + "duration": 0.0, + "text": "So, it's the same alternating structure," + }, + { + "start": 5286.44, + "duration": 0.0, + "text": "So, it's the same alternating structure, but<01:28:06.560> they're<01:28:06.680> using<01:28:06.880> a<01:28:06.960> different<01:28:07.719> sort<01:28:07.880> of" + }, + { + "start": 5287.99, + "duration": 0.0, + "text": "but they're using a different sort of" + }, + { + "start": 5288.0, + "duration": 0.0, + "text": "but they're using a different sort of cheap<01:28:08.360> layer.<01:28:08.680> In<01:28:08.760> their<01:28:08.960> case,<01:28:09.200> they're" + }, + { + "start": 5289.27, + "duration": 0.0, + "text": "cheap layer. In their case, they're" + }, + { + "start": 5289.28, + "duration": 0.0, + "text": "cheap layer. In their case, they're using<01:28:09.480> a<01:28:09.520> state-space<01:28:10.200> model.<01:28:10.560> I'll<01:28:10.640> explain" + }, + { + "start": 5290.95, + "duration": 0.0, + "text": "using a state-space model. I'll explain" + }, + { + "start": 5290.96, + "duration": 0.0, + "text": "using a state-space model. I'll explain what<01:28:11.080> that<01:28:11.240> is<01:28:11.760> next<01:28:12.040> lecture<01:28:12.640> instead<01:28:13.160> of<01:28:13.240> a" + }, + { + "start": 5293.31, + "duration": 0.0, + "text": "what that is next lecture instead of a" + }, + { + "start": 5293.32, + "duration": 0.0, + "text": "what that is next lecture instead of a sliding<01:28:13.760> window<01:28:14.120> sort<01:28:14.240> of<01:28:14.400> local<01:28:14.800> attention." + }, + { + "start": 5295.75, + "duration": 0.0, + "text": "sliding window sort of local attention." 
+ }, + { + "start": 5295.76, + "duration": 0.0, + "text": "sliding window sort of local attention. But<01:28:15.840> you<01:28:15.920> see<01:28:16.080> this<01:28:16.240> is<01:28:16.360> like<01:28:16.520> I<01:28:16.600> think<01:28:17.120> a<01:28:17.160> new" + }, + { + "start": 5297.43, + "duration": 0.0, + "text": "But you see this is like I think a new" + }, + { + "start": 5297.44, + "duration": 0.0, + "text": "But you see this is like I think a new theme<01:28:17.760> over<01:28:17.920> the<01:28:18.040> past<01:28:18.360> year<01:28:19.080> where,<01:28:19.560> you" + }, + { + "start": 5299.63, + "duration": 0.0, + "text": "theme over the past year where, you" + }, + { + "start": 5299.64, + "duration": 0.0, + "text": "theme over the past year where, you know,<01:28:19.760> open<01:28:20.080> models<01:28:20.520> are<01:28:20.640> really<01:28:20.840> trying<01:28:21.040> to" + }, + { + "start": 5301.11, + "duration": 0.0, + "text": "know, open models are really trying to" + }, + { + "start": 5301.12, + "duration": 0.0, + "text": "know, open models are really trying to grapple<01:28:21.480> with<01:28:21.640> long<01:28:21.880> context<01:28:22.280> performance," + }, + { + "start": 5303.23, + "duration": 0.0, + "text": "grapple with long context performance," + }, + { + "start": 5303.24, + "duration": 0.0, + "text": "grapple with long context performance, and<01:28:23.440> the<01:28:23.560> way<01:28:23.719> to<01:28:23.840> do<01:28:24.000> that,<01:28:24.440> at<01:28:24.520> least<01:28:24.719> so<01:28:24.880> far," + }, + { + "start": 5305.63, + "duration": 0.0, + "text": "and the way to do that, at least so far," + }, + { + "start": 5305.64, + "duration": 0.0, + "text": "and the way to do that, at least so far, is<01:28:25.840> to<01:28:25.960> have<01:28:26.200> these<01:28:26.400> hybrid<01:28:26.800> models<01:28:27.480> that" + }, + { + "start": 5307.63, + "duration": 0.0, + "text": "is to have these hybrid models that" + }, + { + "start": 
5307.64, + "duration": 0.0, + "text": "is to have these hybrid models that aren't<01:28:27.840> just<01:28:28.080> global<01:28:28.360> attention,<01:28:29.080> aren't" + }, + { + "start": 5309.35, + "duration": 0.0, + "text": "aren't just global attention, aren't" + }, + { + "start": 5309.36, + "duration": 0.0, + "text": "aren't just global attention, aren't just<01:28:29.640> cheap<01:28:29.880> attention.<01:28:30.280> They're<01:28:30.400> some<01:28:30.560> sort" + }, + { + "start": 5310.709, + "duration": 0.0, + "text": "just cheap attention. They're some sort" + }, + { + "start": 5310.719, + "duration": 0.0, + "text": "just cheap attention. They're some sort of<01:28:30.800> mix<01:28:31.120> in<01:28:31.240> between.<01:28:31.600> And<01:28:31.719> that's<01:28:32.360> that<01:28:32.520> seems" + }, + { + "start": 5312.83, + "duration": 0.0, + "text": "of mix in between. And that's that seems" + }, + { + "start": 5312.84, + "duration": 0.0, + "text": "of mix in between. And that's that seems to<01:28:32.920> have<01:28:33.080> worked<01:28:33.360> very<01:28:33.640> well<01:28:33.840> so<01:28:34.000> far<01:28:34.600> in<01:28:34.719> a<01:28:34.800> lot" + }, + { + "start": 5314.99, + "duration": 0.0, + "text": "to have worked very well so far in a lot" + }, + { + "start": 5315.0, + "duration": 0.0, + "text": "to have worked very well so far in a lot of<01:28:35.080> these<01:28:35.240> models." + }, + { + "start": 5317.75, + "duration": 0.0, + "text": "of these models." + }, + { + "start": 5317.76, + "duration": 0.0, + "text": "of these models. Okay,<01:28:38.280> cool.<01:28:39.000> So,<01:28:39.480> as<01:28:39.640> I<01:28:39.680> was<01:28:39.840> sort<01:28:40.000> of<01:28:40.080> trying" + }, + { + "start": 5320.27, + "duration": 0.0, + "text": "Okay, cool. So, as I was sort of trying" + }, + { + "start": 5320.28, + "duration": 0.0, + "text": "Okay, cool. 
So, as I was sort of trying to<01:28:40.360> emphasize,<01:28:41.200> when<01:28:41.320> you<01:28:41.440> look<01:28:41.640> across<01:28:42.400> all" + }, + { + "start": 5322.55, + "duration": 0.0, + "text": "to emphasize, when you look across all" + }, + { + "start": 5322.56, + "duration": 0.0, + "text": "to emphasize, when you look across all of<01:28:42.640> these<01:28:42.840> models,<01:28:43.240> you<01:28:43.320> start<01:28:43.640> to<01:28:43.719> see<01:28:43.880> a<01:28:43.960> lot" + }, + { + "start": 5324.15, + "duration": 0.0, + "text": "of these models, you start to see a lot" + }, + { + "start": 5324.16, + "duration": 0.0, + "text": "of these models, you start to see a lot of<01:28:44.280> patterns<01:28:44.719> and<01:28:44.840> hopefully<01:28:45.240> a<01:28:45.280> sense<01:28:45.600> of" + }, + { + "start": 5326.07, + "duration": 0.0, + "text": "of patterns and hopefully a sense of" + }, + { + "start": 5326.08, + "duration": 0.0, + "text": "of patterns and hopefully a sense of general<01:28:46.560> understanding<01:28:47.240> about<01:28:47.480> what<01:28:47.960> things" + }, + { + "start": 5328.27, + "duration": 0.0, + "text": "general understanding about what things" + }, + { + "start": 5328.28, + "duration": 0.0, + "text": "general understanding about what things you<01:28:48.480> can<01:28:48.840> do<01:28:49.000> and<01:28:49.120> what<01:28:49.240> things<01:28:49.480> are<01:28:49.560> good<01:28:49.719> to" + }, + { + "start": 5329.83, + "duration": 0.0, + "text": "you can do and what things are good to" + }, + { + "start": 5329.84, + "duration": 0.0, + "text": "you can do and what things are good to folks.<01:28:50.800> Um<01:28:51.080> we<01:28:51.280> also<01:28:51.560> see<01:28:51.760> a<01:28:51.840> lot<01:28:52.080> of" + }, + { + "start": 5332.19, + "duration": 0.0, + "text": "folks. Um we also see a lot of" + }, + { + "start": 5332.2, + "duration": 0.0, + "text": "folks. 
Um we also see a lot of differences<01:28:52.840> in<01:28:53.000> how<01:28:53.160> we<01:28:53.320> handle<01:28:53.680> context<01:28:54.200> and" + }, + { + "start": 5334.31, + "duration": 0.0, + "text": "differences in how we handle context and" + }, + { + "start": 5334.32, + "duration": 0.0, + "text": "differences in how we handle context and how<01:28:54.400> we<01:28:54.520> handle<01:28:54.800> position<01:28:55.200> embeddings." + }, + { + "start": 5336.55, + "duration": 0.0, + "text": "how we handle position embeddings." + }, + { + "start": 5336.56, + "duration": 0.0, + "text": "how we handle position embeddings. Even<01:28:56.800> tokenization,<01:28:57.440> there's<01:28:57.600> some" + }, + { + "start": 5337.709, + "duration": 0.0, + "text": "Even tokenization, there's some" + }, + { + "start": 5337.719, + "duration": 0.0, + "text": "Even tokenization, there's some differences,<01:28:58.280> right?<01:28:58.840> So,<01:28:58.960> there<01:28:59.160> are" + }, + { + "start": 5339.31, + "duration": 0.0, + "text": "differences, right? So, there are" + }, + { + "start": 5339.32, + "duration": 0.0, + "text": "differences, right? 
So, there are differences<01:28:59.760> across<01:29:00.040> these<01:29:00.200> models,<01:29:00.520> but" + }, + { + "start": 5340.63, + "duration": 0.0, + "text": "differences across these models, but" + }, + { + "start": 5340.64, + "duration": 0.0, + "text": "differences across these models, but there's<01:29:00.800> also<01:29:01.000> commonalities<01:29:01.719> that" + }, + { + "start": 5341.87, + "duration": 0.0, + "text": "there's also commonalities that" + }, + { + "start": 5341.88, + "duration": 0.0, + "text": "there's also commonalities that hopefully<01:29:02.240> now<01:29:02.480> give<01:29:02.680> you<01:29:02.760> some<01:29:02.880> intuition<01:29:03.719> as" + }, + { + "start": 5343.87, + "duration": 0.0, + "text": "hopefully now give you some intuition as" + }, + { + "start": 5343.88, + "duration": 0.0, + "text": "hopefully now give you some intuition as you<01:29:04.000> go<01:29:04.200> out<01:29:04.360> and<01:29:04.480> do<01:29:04.560> your<01:29:04.680> assignments<01:29:05.200> and" + }, + { + "start": 5345.35, + "duration": 0.0, + "text": "you go out and do your assignments and" + }, + { + "start": 5345.36, + "duration": 0.0, + "text": "you go out and do your assignments and and<01:29:05.480> sort<01:29:05.640> of<01:29:06.160> mess<01:29:06.360> with<01:29:06.480> the<01:29:06.560> leaderboard" + }, + { + "start": 5347.07, + "duration": 0.0, + "text": "and sort of mess with the leaderboard" + }, + { + "start": 5347.08, + "duration": 0.0, + "text": "and sort of mess with the leaderboard and<01:29:07.160> so<01:29:07.320> on." + }, + { + "start": 5348.15, + "duration": 0.0, + "text": "and so on." + }, + { + "start": 5348.16, + "duration": 0.0, + "text": "and so on. Thanks." + } + ], + "plain": "So today we're going to talk about\narchitecture, which at least to me has\nalways been pretty inscrutable.\nUm and so I'm going to take the approach\nof just telling you kind of everything,\nright? 
I'm going to go through all of\nthe modern papers.\nUm and we're going to just look through\nwhat has everyone done? Um and so I've\ntitled this everything you didn't want\nto know about architectures and\nhyperparameters because I think we all\nwished we lived in a world where the\nonly things you had to know were like VC\ndimension or something, right? Like very\nsimple, you know, theoretical tools, but\nthat's not really where we are.\nSo okay. What we are going to do is we\nare going to try to understand\narchitecture from kind of like a survey\nlens, right? The best thing to do, you\nknow, better than listening to this\nlecture even is for you to go out and\nlike train your own models and try\ndifferent architectures, right? That's\nby far the best thing to do. That's part\nof the philosophy of the course. But\nwe're not going to be able to cover the\nwhole design space of all the different\narchitectures that are out there, right?\nLike that's not something that we have\nthe compute or the time to do.\nSo my opinion is the second best thing\nthat we could do is to try to learn from\nthe experience of others, right? What\nhas What has everyone else done? What\nare the choices that they are making,\nright? And by looking at kind of a\nbroader, somewhat zoomed out picture,\nmaybe we can start to understand, oh,\nthese are the kinds of parameters and\nchoices that are sort of fixed across\nall effective architectures and these\nother ones can be sort of varied without\nimpacting\nhow the model performs, right? So I'm\ngoing to talk about, you know, basically\ntransformer variants. Like what is the,\nyou know, modern transformer starting\nwith, you know, the Vaswani paper and\nthen, you know, as we go to more modern,\nmore recent architectures, what do they\nhave in common? And then what are we\nallowed to vary? 
Or not allowed, but\nwhat do people vary as they go through\nthis, right?\nSo\nI think many of you have taken an NLP\ncourse of some kind or at least seen a\ntransformer, so you've probably seen,\nyou know, the very vanilla transformer\nfrom Vaswani et al. Um you know, there\nthere are some fairly standard choices\nthat you make. You say, oh, transformers\ndon't have positional dependence, so\nwe're going to add a position embedding.\nAnd what do we do? We're going to add\nsome sines and cosines.\nUm we're going to have information\nprocessing through a ReLU. Um and then\nwe're going to have a a post norm. I'll\ntalk about what exactly that is later.\nUm and when you look at your assignment,\nyour A1, you're going to notice some\ndifferences between the standard or the\nvanilla transformer and what we've asked\nyou to implement. Well, we're going to\nask you to move the layer norm to the\nfront of each transformer block or the\nnon-residual layers. We're going to ask\nyou to implement something called rope.\nUm\nand we're going to ask you to implement\nsomething called SwiGLU and not ReLU.\nRight? Why do we pick these? Um one\nreason is we've, you know, copied a lot\nof this over from LLaMA, but so did\neveryone else. Really, I think if you\nwere to train on your own language\nmodel, I think you'll quickly run into\nthis question of, oh, there's so many\nchoices, right? Like what do I choose\nfor all these things?\nAnd so let's now sort of walk through\nall these different models.\nThe way I kind of think about\narchitectures is to think about to look\nat all the different things people have\ndone and say, what are the things that\npeople have done? Can we pick and choose\nfrom those?\nUm Percy always makes fun of me for this\na little bit, but you know, I try to\nlook at all the the different models\nthat come out each year to try to make\nthis lecture.\nUm and last year I thought, oh, there's\njust a couple papers. It's going to be\nfine. It's going to be fine. 
And then I\nlook through all the things and there's\na lot of papers. There's Qwen 2 and\nGemma 3\nand InternLM2.\nand then there were even more. There's\nlike NeMo Tron 4 and Qwen 2 and oh oh my\ngoodness, there were 19 new dense\nmodels. And so last year I had my work\ncut out for me.\nAnd then this year, you know, I thought,\nwell, there can't be that many new LM\nreleases. Like it's got to be slowing\ndown, right? Like people can't keep\ntraining 20 dense LMs per year. Um and\nthat's technically right. There aren't\nthat as many dense LMs. Initially, you\nknow, I was like, oh, there's Qwen 3,\nGemma 4 just came out last Thursday, so\nI put that in there. And almost 3. You\nknow, there's only a couple. And of\ncourse I have to give a shout-out to\nPercy's own 8B model trained with\nMarine. And I was like, oh, we'll just\nhave a few things to cover. Um but it\nturns out if you start looking, there's\na lot of different models. Um and so the\nfact that we have so many different\nmodels, most of these actually are MoEs,\nmixtures of experts, and I'll be talking\nabout that tomorrow rather than today.\nUm because we have such a big diversity\nof models, we actually get a pretty good\npicture of all the different choices\nthat we can make. Um\nso I I made this like little table.\nWe'll come back to this little table at\nthe end of the lecture.\nUm but basically at this point, you\nknow, starting with, you know, the\noriginal transformer, there's been\nactually quite a few autoregressive\nlanguage models kind of trained on the\nsame class of things.\nUm and you can ask questions like, what\nare the different vocabulary sizes? Or\nwhat kind of layer norms do we use? Or,\nyou know, what kind of position\nembeddings do people use? And we see\nsome fairly clear trends. I'll be\ntalking about this as we go.\nOkay.\nUm so\nthe goal here is that we're going to\ncover couple different things. We're\ngoing to cover common architecture\nvariations. 
So these are different\nbuilding blocks of the transformer.\nUm and after we've established what the\nstandard building blocks are, like, you\nknow, what do we use for the the\nnonlinearities or what do we use for\nposition embeddings, then we're going to\ntalk about hyperparameters. We're going\nto go down even lower detail and say\nlike, you know, what is FF dim? Um\nshould we make that a multiple of four\nor like multiply the the hidden by four\nto get FF dim? How many vocab elements\nshould I have? Um\nand then after that, we're going to talk\nabout very low-level tricks of how to\nget models to train stably. And the\nreason why I'm going to talk about that\nin this lecture is because these\nstability tricks have a pretty close\nconnection with the architecture\nvariation, right? Um one of the things\nthat, you know, higher level I want to\nsort of impress upon you is that\narchitectures are actually a a very\ncomplex set of tradeoffs, right? Like\nwhat does a architecture have to do?\nWell, it has to learn from data, so it\nhas to generalize. It has to train\nefficiently on GPUs. And it has to not\nblow up, right? Like halfway through\ntraining, if your, you know, training\nlosses just go like down like this and\nthen suddenly blow up, that's no good at\nall, right? So all these different\nrequirements end up getting baked\nstraight into the architecture. And\nthat's why these things are a little bit\nmessy and a little bit complex.\nUm but you should keep that in mind and\nthat's why, you know, things aren't in\nmany ways not so elegant.\nSo we're going to start with the core\narchitecture piece. And as a high-level\nview, you know, I think the the way that\nI see a lot of the architecture stuff,\nyou know, looking basically\nhistorically, is kind of in the early\ndays of, you know, starting with the\ntransformer until, you know, GPT-3 or\nso, there's a lot of experimentation\nthat happens. People try lots of\ndifferent things. 
There's no like gold\nstandard that everyone has unified on.\nAnd then, you know, LLaMA 2 comes out\nand everyone's like, wow, LLaMA 2 is\ngreat. I want my own LLaMA 2. And so\neveryone starts training LLaMA 2-alikes\nwith, you know, minor variation\nthat people have. And then finally, you\nknow, last year we saw really big\ndifferences or or sort of a trend\ntowards architecture modifications that\nmake training more stable. And this year\nwe see lots of trends towards\narchitecture variations that enable\nlonger context dependence. So there are\nthese big themes that are happening, but\nreally I think, you know, you see this\nlike big point when LLaMA 2 comes out\nand everyone's like, wow, I want to\ntrain something with that. And then\nsuddenly, or not suddenly, but after\nthat, people are starting to explore\nonce again. So it's kind of cool to see\nall these different changes.\nUm I think people can disagree about a\nlot of things on architectures, but\nthere is one thing that everyone agrees\non, you know, like if you take the\ntransformer paper, I think a lot of\npeople will say like the transformer\npeople got like most of the things\nright, except this. And the thing that\nthey really did not get right or like I\nthink most people agree they did not get\nright is where you put the layer norm,\nright? So in the original um\nuh transformer paper, you know, the\nlayer norm goes in what you would call\nthe residual path, right? So in the\ntransformer, you know, you have the\nresidual stream, this X that kind of\nruns through the whole network. And\nsort of a delta back into this residual\nstream.\nAnd then in order to make sure that\nthese gradients are sort of stable\nacross layers, you know, a layer norm is\nplaced at the end of each of these\ncomponents.\nNow, um\ninstead of putting the layer norms in\nthe residual stream, there's an\nalternative. 
Um I'll refer to this as\npre-norm,\num in which you can put the layer norm\noutside of the residual stream, but\nbefore each of the computations. So you\ncan put it before the multi-head\nattention, you can put it before the\nFFN, right? Um we'll call this pre-norm.\nthe nomenclature will get a little bit\nconfusing. Um You can call this uh\npost-norm for now, but let's call this\nsort of residual norm, right? Cuz you're\nputting the norm in the residual layer.\nUm basically all modern language models\nuh push the layer norm outside of the\nresidual stream. This is just like a\nthing that basically everybody does.\nUm there is one funny exception, but it\nis OPT-350M.\nAnd if you all are familiar with sort of\nlanguage models, we kind of know OPT in\ngeneral was, you know, kind of a mess of\na language model, right? And OPT-350M\num is even more so because I don't know\nwhy only that model uh has a post layer\nnorm in the residual stream.\nOkay. So this is one of the things that\nlike everyone agrees on. And so you\nmight wonder like why is this like such\na, you know,\nuh like a uni- unified thing across all\nthe different models?\nUm and if you look at some of the early\nworks studying like where do you place\nthe layer norm style research, um what\nyou really see is that, you know, the\nearly motivation for a lot of this was\nwhen you train a transformer, you need\nto do a warm-up. Actually, you know,\nmodern transformer training still does\nwarm-ups as well. But you definitely\nneed to do warm-up when you train.\nNow, wouldn't it be nice if we could\nremove the warm-up, right? So, that was\nkind of the initial motivation for a lot\nof this research.\nBut people quickly realized that\nremoving the warm-up had very serious\nissues in terms of the stability and\nconvergence\nof these things, right? 
So, if you did\npost norm plus layer norm, which is, you\nknow, basically the original transformer\nthing, you got this purple dash line.\nOh, you just don't converge as well\ncompared to doing something like pre\nnorm. You can ignore the other terms.\nYou would get much nicer convergence\neven without warm-up, right? So, this\nwas the original motivation.\nBut really what people kind of quickly\nrealized is that, you know, moving the\nlayer norms outside the residual stream\nhas some pretty important implications\nas you make your network deeper and as\nyou start to grapple with stability\nissues.\nTo me, I think the gradient attenuation\nissues are kind of the most clear.\nWhen you sort of talk to people who do\narchitecture design, I'm not really one\nof the the people that deeply engages in\nthis. But one of the things that people\noften say is keep your residual stream\nclean, right? So, in this case, you have\nyour X's coming in from the bottom on\nthe pre norm side, and this X propagates\nall the way up to the top, right? All\nthe way up to your your final output.\nAnd that allows gradients to propagate\nif you in the backward pass straight\nthrough this, right?\nUm that makes gradient propagation very\nsimple,\nwhich improves both stability and signal\npropagation. And that's sort of what\npeople realized very, very quickly, that\nif you do something like pre norm in\nblue\ninitialization, sort of the gradient\nsize is kind of remains the same, right?\nBecause you have this nice straight\nthrough propagation in the backward\npass. On the other hand, if you have\npost layer norm, you have these kind of\ncomplicated effects that happen because\nyou're layer norming each time you're\ngoing through a transformer block. 
And\nthat's going to change the norm of your\ngradients as you go backwards through.\nSo, you can kind of see,\nyou know, from the principle of keep\nyour residual stream clean, that pre\nnorm makes a lot of sense.\nPeople also realized through\nexperimentation that this also improves\nstability in general, that the sizes and\nfrequencies of gradient spikes\nwere improved under pre norm compared to\npost norm. And, you know, this is a\nfigure from Salazar and Nguyen, who were one\nof the first ones, I think, to to study\nthis phenomena carefully.\nI think this is the reason why it stuck\naround, right? Stability and the ability\nto go deep are both very, very important\nfor modern large language models. And\nso, this idea of moving your layer norm\noutside of the residual stream is one\nthat basically everyone has adopted.\nUm so now, you know, if putting layer\nnorms in residual streams is bad,\nwhy does layer norm have to be at the\nstart? Of course, we have pre norm,\nwhich is, you know, before our\ncomputation, but we could have it after\ncomputation as well, right? That's\nequally good at least under that\nknowledge logic.\nUm and that's exactly right. Many recent\nmodels like Grok or Gemma 2 or Olmo 2\nhave the structure where they moved the\nlayer norm after the computation. So,\nit's a post norm of a kind, but it's\noutside the residual stream.\nOther models still actually just put\nlayer norms everywhere. They put a layer\nnorm here, they put a layer norm after.\nI'll get to this later as we talk about\nstability, but one of the other lessons\nthat it seems to have held up very well\nis if you have stability issues, you can\nkind of sprinkle in layer norms\neverywhere, and that will generally\nimprove stability.\nIt's almost very strange to be saying\nthis cuz it's so ridiculous. And yet,\nthat statement has actually been proven\nright. 
Every time, you know, people have\nencountered stability issues, they say,\n\"Oh, but what if we just throw a layer\nnorm into attention?\" Turns out that\nworks, too. We'll get to that later. So,\nokay, that's post norm\nor double norm in this case where you\nhave two layer norms here.\nOkay. The other thing that you can do is\nin the original transformer, you have\nthe layer norm, which is this operation\nright here. So, you have your\nactivations X.\nYou're going to mean subtract, divide\nthe variance, and then scale it back up,\nright? And this works just fine, right?\nIt's not like this is wrong. And many\nmodels have successfully trained on this\nscheme.\nBut basically, most or all modern\nmodels, I think,\nuse RMS norm, which doesn't subtract the\nmean or add a bias term, right? So, it's\njust a scaling down and scaling back up,\nright? So, you can see this in the\nequation here.\nUm and really, layer norm is more\nexpressive than RMS norm. So, there's\nreally representationally no reason why\nyou have to use RMS norm.\nBut RMS norm is nice because in\npractice,\nthere's really no expressiveness loss.\nRMS norm models just as well as layer\nnorm. But more importantly, it is, you\nknow, faster, right? This is the part\nwhere kind of the systems and sort of\narchitecture co-design starts to come\nin.\nPercy mentioned, you know, in the\nprevious lecture, this idea of\narithmetic intensity, right? We want to\nkeep our GPUs hot by doing, you know,\nmatrix multiplies and other very intense\ncomputations. We do not want to be\nwasting our GPUs by having them move\nlittle tiny bits of memory back and\nforth, right? That's a very inefficient\nuse of our, you know, very powerful GPU.\nAnd so, what we want is to remove\noperations that are small and involve\nmemory movement, but don't give us much\nexpressive power, right? 
So, by that\nview, what we really want to be doing\nhere is, you know, if the mean\nsubtraction and addition isn't really\ndoing much for us, just get rid of it,\nright? Um\nyou might think, \"Okay, why does this\nmatter? We're just optimizing this\nteeny, tiny operation that accounts for,\nyou know, in this case, something like\n0.17%\nof the total floating point operations\nof our system.\"\nBut, you know, as Percy mentioned, it's\nnot really about the flops, right? The\nflops are the the floating point\noperations we do, that's sort of\nmultiplying matrices, but that's not\nruntime, right? Runtime is a much more\ncomplicated object. And, you know,\nstatistical normalizations, things like\nlayer norms, even though they're only\n0.17% of the flops, depending on your\nworkload and depending on the setup, can\nbe up to 25% of the runtime, right?\nThat's kind of crazy. On tiny models,\nthis can be really, really big because\nyou're still having to move all these\nparameters back and forth from fast to\nslow memory and vice versa\nwhen you're doing these operations. So,\ndata movement is really, really\nimportant, and RMS norm can still matter\na lot because of this, right? So,\num\nyou can see kind of the difference here.\nThe arithmetic intensity is in white,\nand then you can kind of see the flops\ninvolved in black. And you see that\nlayer norm has a very low arithmetic\nintensity, which is the operation we try\nto want to remove as much as possible.\nYeah, question over there.\nData movement for normalization is so\ndisproportionate compared to arithmetic\ncontraction\nSo, for a ten- Something like tensor\ncontraction, which is, in this case,\nmatrix multiplies,\nthe majority of the workload is, you\nknow, multiplying. Whereas for stat\nnormalization, the majority of the\nworkload is memory movement. And memory\nmovement is quite slow. 
So, imagine the\ncase where like moving something is like\nalmost all of the compute, then you're\nstill paying quite a bit here, right?\nCuz activations can be quite large.\nYeah, I think the percent runtime in\nthis case is quite extreme. This is like\nin the, you know, tiny models with like\nmatrices that don't really generally\nmake sense in modern workloads, but this\nis, you know, giving you a sense of why\nthis is a free optimization.\nUm and you do see this, right? This is\nanother paper in which people were\nevaluating different architecture\ninterventions. Narang et al. in\n2020.\nI think this was a Google paper, and\nthey show, you know, for teeny, tiny\ntransformer of a 200 million parameters,\nyou got more steps per second. That's\nthe third column over here when you\nswitch to RMS norm. And in fact, you\nactually get better performance, which I\ndon't think is something that you're\nguaranteed, but it's a nice bonus\nregardless, right? So, you got a free\nsystems win by just moving to RMS norm.\nAnd so, basically, everyone has has\ndecided to move over to this now.\nAnd in general,\nthere's a more general version of this.\nThere's no guarantee to any of the\nthings I'm saying, but bias terms in\ntransformers and neural networks\nare generally not that useful. So, in\nthe original transformer, the linear\nterms all have biases,\nbut most implementations actually just\ndrop the biases entirely, right? Once\nagain, this is another example of\nsomething that's not very arithmetically\nintense, but fairly memory\nintensive, relatively speaking. And so,\nyou might as well just drop these,\nright? And get the free systems win.\nThere's also some cases, I'll just\nmention this offhand, where the bias\nterms can also induce stability issues.\nSo, they're useful in other ways, but\nreally, I think the primary reason these\nare dropped is just to simplify things\nfrom the systems perspective.\nCool.\nOkay. 
So, I think layer norms, the story\nis pretty easy.\nUm it's easy in the sense that what\npeople do is fairly standardized. Our\nour understanding, not at like a deep\ntheoretical level, but our understanding\nof like what layer norm does is fairly\ngood, right? Everyone moves the layer\nnorm outside the residual stream,\noften pre norm, but I think this might\npartially be be because Llama 2 did\nthat. Um and we roughly have a sense of\nhow to use layer norm to control things\nlike\ngradient spikes\nand keep signal propagation nice.\nRelated to that, we also now, you know,\nbasically always use RMS norm, and you\nhopefully understand the general\nprinciples here\nof basically just dropping bias terms.\nAnd that allows us to to keep our system\nmore arithmetically intense\nwhile keeping the expressive power the\nsame.\nUm I think the unsatisfying thing about\na lot of architectures is that, you\nknow, you can't really reason about this\nbeforehand, right? Like we don't know\nbeforehand that dropping the bias terms\nis okay, but from a lot of\nexperimentation and now collectively\nacquired knowledge, we roughly know that\ndropping the bias terms on both the\nlinear and RMS norm is okay for typical\nlanguage modeling workloads, right? Um\nthis is the kind of statement that we\ncan make on the basis of of what we do\nwhen we look at a variety of different\nmodels.\nOkay, any questions uh for layer norm\nstuff?\nGood. Okay. So, now I'm going to talk\nabout activations. Um\nand there's a whole zoo of activations.\nThere's just a lot, right? Like ReLU,\nGELU, Swish ELU, GeGLU,\nSeLU, SwiGLU, LiGLU.\nUm and what are these things?\nUm I think at what point one point of\nmy, you know, more stats ML training, I\nthought to myself I will never learn\nthese things. I will make it a point of\npride to never know what a SwiGLU is. 
Um\nbut now it's actually very important for\nus to to actually like have a general\nsense of what these objects are um and\nwhich parts of these names actually kind\nof matter for performance, right?\nUm So, you can build and train a\nlanguage model on just a fairly vanilla\nactivation. Um\neven, you know, I guess Chinchilla is\nprobably the best model out of that\ngroup, but even if you just want a ReLU,\nyou know, you can train a reasonably\nperformant language model using just\nthat activation. There's nothing wrong\nwith that, right? And if we move to\nGELU, which is a Gaussian error unit,\nand really the only difference is this\ntiny divot at the bottom here, which\nreally, you know, for the most of the\nactivation doesn't change anything, but\nchanges the gradients right near zero,\num then you can train models like GPT-3,\nright? That's a perfectly good large\nlanguage model, not, you know, modern by\nmodern standards, but perfectly fine.\nUm but then, you know, we get to the\ngated linear units like SwiGLU and\nGeGLU, and these are really where most\nof the action is. You know, this is very\nsimilar to layer norm in that I think\nalmost all credible modern language\nmodels use a gated linear unit of some\nkind.\nOkay, so what is a gated linear unit?\nSo, these are gated activations. So, if\nwe want to look at something like a feed\nforward layer, um we can just look at\nthis first part. This is, you know, a\nvery standard ReLU feed forward, right?\nI have my X, I hit it with a W1, you\nknow, I I entry-wise threshold at zero,\nand then I hit it with another W2, I get\nmy output, right? Very straightforward\nReLU network.\nAnother thing, um I don't say this as my\npersonal experience, but another thing\nthat is often said in architecture\ndesign is that gating is often very\nhelpful. 
So, if you apply that very\ngeneral heuristic, what you might get is\nto say, \"Okay, well, instead of just\nhaving, you know, this entry-wise ReLU,\nwhy don't we also have a gate? And the\nsecond gate, the second term here, this\nis just going to multiply the output of\nmy ReLU entry-wise, and I have a second\nmatrix V, okay?\"\nNow, this is just going to modulate the\noutput of my ReLU.\nUm and then I'm going to do everything\nelse the same, right? So, instead of,\nyou know, just having XW1W2, I have XW1,\nand I'm going to gate that with XV. This\nis another\nuh activation the same size as this, and\nthen I'm going to, you know, down\nproject it back with W2.\nOkay, so what is this? Now, this is a\nReGLU. This is a you you make these\nnames by adding the first activation, in\nthis case ReLU, and GLU, right? So, the\nReLU gated linear unit.\nUm and gating has been a, you know, very\neffective other primitive in\narchitecture design, and it turns out\nthat this is very effective in language\nmodeling as well.\nSo,\num if you take something like a GELU,\nwe've already talked about that, right?\nThat's like the ReLU, but with a little\ndivot at the bottom here, um you will\nget a GeGLU. Um and if you take a\nSwish, which is X times a sigmoid, then\nyou will get a SwiGLU. So, this is a\nSwish times, you know, the rest of it.\nUm and this really covers a lot of the\nmodern models, right? Um generally the\nGoogle folks have used GeGLU, so like\nthe Gemma models, the T5 models are\nthose. Um and everything that's kind of\nlike a LLaMA descendant uses a SwiGLU.\nUm so, PaLM and the LLaMA descendants\nare all kind of SwiGLU models. Um\nI would say that SwiGLU is probably the\nmore dominant one, but honestly amongst\nthe gated units, doesn't really matter.\nNow, here's a side note that will uh be\na semi-important piece of trivia later.\nUm if you look up here, right? Um you\nwill notice that there are more\nparameters for the gated uh model,\nright? 
Cuz I have this parameter of V.\nAnd so, if you do a little bit of math,\nright? I now have three matrices instead\nof two matrices,\nright? What you should do is you should\nmaybe use a smaller feed forward\ndimension by a factor of 2/3 in order to\nkeep the total parameter count the same,\nright? So, this is roughly the idea of,\n\"Well, I want to keep the same number of\ntotal parameters as my original MLP, but\nI now want to make it gated, so I'm\ngoing to make the feed forward\ndimension, which is the output dimension\nof this W, a little bit smaller by 2/3,\nright?\" So, this is a general rule of\nthumb that people have followed, but\nit's not really an iron rule.\nYou know, the original Noam Shazeer\npaper that, you know, proposed this,\nhad some, you know, very small deltas\noriginally, but they're consistent\ndeltas, and I think to his credit, um\nI think a lot of his papers have these\nlike error bar assessments of like\ntraining multiple replicates and\nchecking to see if they're better. Um\nand if you look, the GLU variants are\nalmost always consistently better than\nthe non-GLU variants. And this is a\nparameter matched comparison because um\nNoam Shazeer is always doing this 2/3\nadjustment to make sure that all of the\nmodels have the total same total number\num of parameters.\nSo, this is quite nice. It's in some\nways a free win. Um almost everyone uses\na GLU. There have been other sort of\nmore controlled systematic comparisons.\nThis is uh the same paper I was talking\nabout before, Noam et al. in 2020.\nUm Google actually in the 2020s did\nquite a few nice large-scale\narchitecture comparison papers, um\nalthough with a T5 architecture and not\nuh autoregressive uh language model. 
Um\nand they, you know, basically\ncomprehensively compare things like\nGLUs, and you see once again, um if we\nlook at the SwiGLU or the GeGLU or the\nGLUs in general, they do significantly\nbetter at loss or the other downstream\nmetrics, right?\nFairly compelling on paper uh on these\npapers, also clear from now a lot of\nmodel training runs that SwiGLU and GLU\nare good, right?\nSo, there's a lot of variations in\ngating, but really the important single\naxis to know is that gating uh for these\nnonlinearities is actually quite\nimportant, gives you\nuh nice boost without much of a\ncomputational cost. Um you know, that's\nnot to say that gated linear units are\nnecessary. I mean, GPT-3 was that. Um I\nthink the Nemotron 340B model used a\nsquared ReLU, which is a kind of a crazy\nchoice, but that works, too. Um both of\nthese models are perfectly performant,\nbut it's actually quite rare to see\nanything that's not trained on a gated\nlinear unit, right? So, evidence is\npointing towards consistent gains on\nusing these gating tricks.\nSo, those are I think the the more\nconsensus choices for things that we can\ndo in architecture. Um now, this one I\nthink is a really fun idea, but one that\nI think now the test of time has shown\nmaybe is not quite as good or maybe not\nas popular of an idea.\nUm normally, we do our transformer\nblocks serially, right? We compute our\nattention, then we compute the MLP,\nright? One after the other.\nUm if you're very systems-minded, you\nmight say, \"Well, this introduces a\nbottleneck, right? 
I have to wait for\nthe computation of one to do the other.\nIf they were instead in parallel, I\ncould bring to bear some new and cool\nsystems optimizations, potentially,\nright?\" So, you might ask, \"Could we\nparallelize the transformer block?\"\nAnd um this was originally an idea that\nwas in GPT-J, which is the open-source\nattempted replication of GPT-3.\nUm and kind of very interestingly, I\nthink GPT-J has been surprisingly\ninfluential in sort of propagating a lot\nof ideas. I mean, PaLM as well. Google\num is actually surprisingly bold with\nthe architectures that they do. Um but\nthe description in uh PaLM, uh which you\ncan see in their report, is kind of the\nfollowing. Instead of nesting this,\nwhich is the sequential format at the\ntop, you know, you're just going to add\ntogether the output of the MLP and\nattention layer, and just add both of\nthose back into the residual stream.\nUm if you implement this right, you can\nactually share a lot of the components.\nLike, you can share the layer norms, you\ncan fuse the matrix multiplies. This\nallows you to potentially get additional\nsystems optimizations. Um\nAnd I think a lot of the people that\nhave been influenced by Google, so\nCohere, you know, was founded from one\nof the former\nuh transformer authors, they do a lot of\nGoogle-inspired optimizations. They\nfollowed kind of this architecture. Um\nbut not very many others. Um this has\nbeen a approach that has really fallen\nout of popularity over the past, I\nthink, 2 years. 
Um I think mainly\nbecause optimization of the serial form\nhas gotten sufficiently good that the\nsystems gains from the second one just\nisn't worth the small hits to uh\nrepresentation power that you end up\ngetting going from uh parallel to\nserial.\nEffectively, you can think about it as\nyou've lost half of your depth, right?\nAnd that can be\nuh a deleterious\nuh thing to do to your model.\nSo, in terms of the architecture things,\nactually, you know, the fact that this\nis so short should kind of suggest to\nyou how much the original transformer\nformulation has somewhat stood the test\nof time, right? Cuz the only thing I'm\nreally talking about changing here\num is, you know, where the norms go, or\nyou know, whether we have bias terms, or\nwhether we gate the MLPs, but those are\nactually pretty minor changes compared\nto all the things that you can do.\nNow, uh, those of you that are, you\nknow, sort of carefully paying attention\nmight say, but wait, you know, there's a\nlot of transformer alternatives that\nchange the attention. Um, yes, you'll\nhave to wait until next lecture because\ntoday I'm just only going to cover sort\nof core attention based methods. Um, and\nnext lecture I'll throw in a little bit\nof a state space model stuff, but as\nlong as you're in this like dense\nattention land, actually the\narchitecture from the original\ntransformer paper is pretty close to\nwhat we do.\nSo, you see, uh, quite a bit of this,\nright? So, uh, just now going back to\nthis, blue here is RMS norm block as\nlayer norm. You see most of the modern\nmodels are sort of RMS norm models.\nSerial versus parallel layers, the blue\none's parallel, the rest is serial. You\nsee mostly serial layers. Um, pre-norm\nversus post-norm. 
Some of these, uh,\nones that I marked as post-norm are\nactually pre and post-norm.\nUm, and then these ones on the right,\nthese are GLUs, uh, almost always with\nthe exception of things like, uh,\nFalcon, which use a gated linear unit,\nbut almost all of these are really, uh,\ngated linear units for modern models.\nSo, you can see the trends quite\nvisually, um,\nfrom what I'm telling you.\nOkay. So, really the thing that is very\ndifferent across implementations, and I\nthink a place where a lot of the\narchitecture stuff is still in flux, is\nhow you do kind of position dependence\nand incorporate information from other\npositions, right? So, the core attention\ncomponent in some sense.\nUm, so there are lots of different ways\nthat you can encode position into a\ntransformer.\nAnd just so you know, to to remind you,\nright? This is very, very important\nbecause attention is positionally\nindependent, right? They're just inner\nproducts, so you can just shuffle them\nand attention would be the same if you\ndon't have a position embedding.\nThe original transformer had sine and\ncosine embeddings, kind of like a\nFourier transform intuition that if you\nhave sines and cosines, then you can\nkind of recover position from that no\nmatter what.\nUm, a number of other sort of large\nmodels that, you know, followed soon\nafter that used absolute embeddings,\nwhere each position had its own\ndifferent embedding.\nUm, and then, uh, several other sort of\nGoogle models like to use relative\nembedding. So, in here you're not\nadding, um, embeddings into the into the\nembedding, uh, like word vector\nembeddings, but instead you're adding a\nvector to the attention computation\nitself, right? So, if you're three\npositions off, sort of the attention\nmatrix gets a different offset added to\nit. 
And and, you know, models like T5\nand Chinchilla use kind of this scheme.\nUm,\nthe thing that has really become pretty\ndominant in terms of position embedding\nis this class of embeddings called rope,\nwhich some of you may be familiar with.\nUm, most models past 2024 use this type\nof embedding. And it's kind of\nremarkable given that rope, you know, in\nsome ways came out of nowhere.\nUm, originally I think this was also a\nGPT-J innovation, um,\nfrom I think, uh, sort of not very well\nknown sort of blog post and, uh, paper\ncombination, uh, from an author in\nChina. Um, but really it has some really\ninteresting ideas for for why you would\ndo something like rope.\nSo, rope, you know, is a relative\nposition embedding. And a relative\nposition embedding, let's make an\nopinionated stance that I should not\ncare about the absolute position of any\nwords. So, if, you know, A uh, an apple\nappear together, even if it appears at\nthe start or at the end, right? In rope\nembeddings, they should kind of get the\nsame, uh, sort of result. Um, and we do\nknow that, you know, or and we want to\nsort of represent it in this way, right?\nSo, I have an embedding F, and I have\nanother embedding F, and these are going\nto take in the identity of the words X\nand Y and the positions absolute of I\nand J.\nAnd I want this to be equal if I take\nthe inner product of these embeddings to\nbe equal to a function that only depends\non the relative difference.\nRight? Um, and every existing embedding\nbefore it didn't really fulfill this\nequality. Like sine is not relative\nbecause it has these absolute cross\nterms that are not relative. Absolute\nposition embeddings, just by the the\nname of it, is obviously not relative.\nAnd then relative embeddings,\ntechnically these are relative, but\nthey're not kind of embeddings because\nthey're just adding to the attention\nmatrix, right? 
So, there's no inner\nproduct structure that, you know, you\ncan extract out of the\nSo, given this, you might ask, is there\na nice way that we can truly have this\nrelative embedding?\nAnd the idea is very cool. Um, it's\nreally just looking at kind of uh,\nproperties about angles and cosines. So,\nwe want our embeddings to be invariant\nto absolute positions, and we know that\ninner products of any kind are invariant\nto arbitrary rotation, right? So, the\nidea is to say, I'm going to take my\nsemantic word vectors, the ones that are\nare independent of any position. So,\nthis is my starting point. And then I'm\ngoing to rotate each of these vectors,\nin this case in 2D, um, based on the\nposition that the words appear. So, you\nknow, just as a\nuh, simple example, we, let's say we\nhave the uh, sentence, we know that,\nright? We appear at position zero, so\nI'm not going to touch that at all,\nright? I'm just going to keep that where\nit is.\nThe word know is at position one, so I'm\ngoing to rotate it by some angle, right?\nAnd that's my my one position rotation.\nNow, what happens if I apply the same\nidea to, uh, the following sequence, of\ncourse we know, right? In this case, we\nand know are still adjacent, they're\nright next to each other, but their\nabsolute position is shifted, right? Of\ncourse, you know, comes before we know\nnow. In this case, I'm going to rotate\nthe word we by two positions because\nit's two index, right? 0 1 2. So, the\nword we is in the second, uh, position\nnumber two, so I rotate by two. I rotate\nknow by three positions cuz it's in\nposition number three, and what do you\nknow, the relative angle between these\ntwo is still separated by one, right?\nSo, this is a very, very simple idea of\njust using rotations, uh, to represent,\num,\nposition. 
And if we do that, then\nanytime we take an inner product, those\ninner products are going to be invariant\nof absolute positions.\nNow, you might say, well, in two\ndimensions that's pretty easy cuz you've\nonly really got one choice, you got\nclockwise and counterclockwise, but in\nhigh dimensions, there's an infinite\nspace of ways that you can rotate\nvectors. So, what do you do in D\ndimensions? Um, well, you do the\nsimplest possible thing and it works.\nThe simplest possible thing is to reduce\nit to the 2D case repeatedly. So, you\nhave a D-dimensional vector, just cut it\nup into chunks of two, and each pair of\ntwo dimensions gets rotated. And the\ntheta at which these things rotate vary,\nright? Some of them are very low\nfrequency, so they rotate very slowly,\nso they uh, they can capture long-range\ndependence. Some of them rotate very\nquickly, so they capture things like,\nare they neighbors to each other, right?\nUm, and then at the end, you know, after\nI've rotated every pair of vectors, I\nget sort of my final embeddings. So,\nthis is the, you know, rope approach.\nthe paper, if you read it, has a very\ncomplex motivation about complex\nnumbers, but really I think the\nintuitive way, at least to me, to think\nabout it, is to just you want to rotate\nby reducing to the two-dimensional case,\nand you're just rotating every pair of\ncoordinates.\nGemma 4 just came out on Thursday, and\nthey have like another different kind of\nfun thing that they do, which they call,\num, I think proportional rope or P-rope,\num, which is a really strange way to\njust say that the only thing they rotate\nis the first two coordinates, but that's\nanother valid thing that you can do as\nwell. So, there's a lot of different\nthings that you can do in this space\nthat end up working.\nOkay. 
In practice, what you're going to\nend up doing is, you know, you can take\nyour vector and you can make a sparse\nmultiply with sines and cosines, and\nthis is going to be giving you some way\nof rotating your input vectors X's,\nright? So, X times, uh, W times R, this\nis going to be your final embedding that\nyou get.\nUm, and finally, you know, this is a\nsine and cosine, which looks a little\nlike sine embeddings, but it's really\nimportant that I'm multiplying with\nthese sines and cosines rather than\nusing them as embeddings cuz that means\nthat there are no cross terms. Um, and\nthis is purely relative, right? There's\nno absolute position information that\nyou'll get out of inner products.\nUm, if we really, really wanted to get\ninto low-level details and you ask like,\nhow do I actually implement this thing,\nyou know, you're going to have to do\nthat. Um, you have your usual attention\nstuff, and then what you do is you\ngenerate cosine and sine angles, um,\nbased on the position IDs of sort of\nwhere your sequence is, and then you're\ngoing to apply those cosines and sines\nonto both your queries and keys for your\nsort of attention computation, um, and\nyou can either apply them as a matrix\nmultiply or you can go through and apply\nthem manually, uh, just as a rotation,\nright? Fairly straightforward, and you\nwould do this at the attention level\nrather than at the very bottom to sort\nof enforce position invariance every\ntime you're doing attention\ncomputations.\nOkay. So, that was rope. Um, it is a\nlittle bit confusing, but once you\nunderstand the geometry of just rotating\nthings, it's actually fairly, uh,\nstraightforward.\nOkay. I'm going to pause here for one\nmoment, um, in case anyone has any\nquestions about the various like\narchitecture bits. Um, we're going to\nthen talk about even lower-level details\nabout hyper parameters. 
So, yes.\nDo you know about any papers that do a\nhigher-dimensional rotation?\nHigher-dimensional rotation never\nworked?\nIt's a good question.\nI don't think so. By a\nhigher-dimensional rotation, like any,\nyou know, 2D rotation in the space would\njust be kind of a variant of this. You\ncould certainly do like any one manifold\nthat like is a closed loop. I have not\nseen that.\nYes. What do you recommend for this?\nWhat do you think is the best way to\ndistill this kind of knowledge problem.\nPeople who are back to work boards.\nIt's a good question. Um\nI don't know if there's a way beyond\nsome combination of like looking broadly\nenough to get a to get to get a pattern,\nwhich is what I the procedure I'm trying\nto do in this lecture here. And then the\nother one is to try it yourself even a\nmuch smaller scale to form an intuition\nand like a theory for how these things\ncome together. I think those two are\nreally the right ways. I think reading\nany single paper in isolation is very\nvery difficult especially now because no\nsingle paper seems to give any full\ndetail for a lot of language models\nthese days.\nOh, lots of questions now. Okay, good.\nWe'll go in Yeah. Um so I have a\nquestion about the question on the\nparallel layers and the the serial\nlayers. Yeah, I understand the modern\nmodels are\nthinking of the resource efficiency. So\nthey will use the parallel layers. They\nhave the\nidea but there's there's a difference\nthere's a there's a big difference\nbetween the accuracy um for these two\npatterns, right? I want to know like\nwhat's the What's the\nWhat's the difference of of the\naccuracy? Is it big enough to Is it\nsmall enough to allow the current model\ntrainers to ignore that or is there any\nproblem? Yeah, I think you know, the\nthat's actually really mixed. So if you\nread the original Palm paper, I think\nthey're like very confident about the\nuse of parallel layers like no\nperformance drop 15% systems utilization\nimprovement. 
So if you read just that\nyou'll kind of say like oh, it's just as\ngood. Um but I think a lot of the the\nlater Google models have stopped using\nthis, which you can take on as an\nimplicit signal that actually there\nmight be some losses. And once again,\nthis one is a little bit hard to um to\nget precise numbers on because no one's\ndone the ablations as far as I know on\nparallel versus serial um controlled\nnice ablations at least.\nYeah.\nSo yeah, so\nwhat's the difference between like Eagle\nand RoPE?\nYeah, yeah. I mean this difference is\nreally just like which of the\ncoordinates you're rotating.\nLike you don't rotate most of them\nbecause a lot of the I mean the argument\noriginally I think is that the low\nfrequency parts just aren't rotating\nvery much. And so you can drop them if\nyou're really strapped for you know,\nsort of extra space. And these this is\nreally a optimization for teeny tiny\nmodels where like you don't have very\nmuch like hidden dimensions to to have\nactivations for.\nFor the relative embeddings not having\nan inner product,\num is that cuz it only applies to keys\nspecifically? I'm trying to understand\nthe logic. Yeah, so they applied both\nthe the keys and values, which is kind\nof why you know, you get this like\nrelative effect from where you are. Um\nyou want to not have cross terms, right?\nSo so if you look at the sine and cosine\nembeddings, then you'll not only get\nsort of the you know, the original\nvectors, you'll kind of get these weird\ncross terms between the position\nembeddings and the word embedding\nthemselves and so on and so forth. And\nthen you can kind of back out what the\nabsolute position is. So even sine and\ncosine embeddings are not like pure\nrelative position embeddings. Um\nyou know, you have to accept the premise\nthat you know, the relative embedding is\nwhat you want. 
But once you do kind of\nyou end up at the RoPE solution somewhat\nnaturally.\nSo what's the issue with So the issue\nwith this is that it just can't be\nfactorized as an inner product. That's\nmore of an aesthetic problem, right?\nLike if if your constraints are I need\nit to be uh relative and I need it to\nfactorize as f of xi and f of yj, then\nthis is not a solution in that class. Um\nto be fair, there's a lot of um\nembeddings that work this way that do\nwork like Alibi and other kinds of like\napproaches like do do this kind of\ninject into the attention matrix and\nthey do reasonably well. Um it's not\nnecessarily the one that's become the\ndominant approach is what I can say.\nCool. Okay.\nGreat.\nNow we'll talk about hyperparameters um\nand I think hyperparameters are really\nsomething that you start to engage with\nonce you like actually have to train a\nmodel, right? When your knowledge about\nlanguage models are abstract, you don't\nhave to care about any of these. But\nonce you have to instantiate it, you\nstart to ask questions like well, how\nbig should the feed forward size be?\nUm how many heads should I have?\nUm what should my vocab size be, right?\nand you might also have questions of\nlike what should my weight decay or\ndropout be? Like do I even need to\nregularize? I I have a lot of tokens,\nright? So do I need regularization?\nUm and do I need very deep models or\nvery wide models? Like what are the the\nright kinds of things to do here, right?\nUm and all of these if you start out\nwith no knowledge, it's actually very\ndaunting because you have to search this\nlike very big high dimensional space.\nUm but the space of things that people\ntry is actually pretty small. 
And from\nthat maybe you can start to think about\nyou know, smarter search processes of\nlike where you want to vary things.\nOne of the things that's a really\nconsensus hyperparameter\num is this idea of the uh\nratio between the feed forward size,\nwhich is kind of the output of your\nfirst matrix multiply in an MLP, and the\nmodel dimension, right? So this is\nreally the the uh ratio of the two\ndimensions of your W1 and your as well\nyour W2 matrix.\nUm this seems like a thing that's very\nimportant and controls kind of the\nrichness of your MLPs. So what should it\nbe? Well, for whatever reason, it should\nmaybe be four times your hidden\ndimension, right? Um and this is a rule\nof thumb that works remarkably well and\nI will show you some data on like why\nmaybe this is a fine number to choose.\nThere's a few exceptions um and funnily\nenough, the really extreme exceptions\nkind of backtrack on that.\nUm exception number one is variants of\nthe gated linear unit. I already told\nyou about this. So if you were thinking\nabout it, this is probably cached in\nyour head, right? GLUs have more\nparameters, right? If you keep the same\ndimensions. So if you want to keep the\nparameter size of your MLPs the same,\nwell, you need to scale down by 2/3,\nright? So most GLU variants, this means\nthat you're going to end up with\nsomething like 2.67-ish,\nright? So everyone that's uh down here\n2.67 to 2.5, this is roughly applying\nthis like 2/3 correction.\nUm and then for whatever reason, um the\nLlama 2 folks decided, well, we actually\nhave very efficient um attention heads\nwith like um\nuh\nuh MQA, which I'll talk about later. Um\nand because of that, we can multiply\nthis ratio by an arbitrary 1.33 and\nwe'll get roughly 3.5. And so the Llama\npeople kind of like arbitrarily chose a\nslightly different ratio, which\nessentially emphasizes the MLPs a little\nbit more. 
Um but really if you actually\nlook through all of the the papers,\nyou'll find you know, either 2.6-ish or\n3.5 for GLUs um or four if you're doing\nuh non-GLU models.\nOkay. There's another exception, which I\nfind to be very funny but also very very\ncool, which is um\nyou know, throughout as you read these\nlike technical reports, you'll find that\nmost people are just very boring in\ntheir choice of architectures. They're\nlike we did Llama but we changed one\nthing. Um but you know, folks at Google\nare very bold sometimes um and T5 is one\nof my favorite ones because they have\nsome really bold settings. Uh they\ndecided that um instead of following\nthis like 4x rule of thumb, they decided\nthat they want to have a 64x\nmultiplier, which is like way bigger\nthan four. Um and they have a reasonable\nargument for this as well. This is\nanother like systems-based argument,\nright? They said, well, you know, if the\nbigger my matrix multiplies, the more\nefficient I can keep my hardware. So if\nI make this, you know, multiplier really\nbig, then you know, my matrix multiplies\ncan potentially be sort of more\nefficiently utilized, right?\nUm and some others like Gemma 2 have\nalso tried to really push a little bit\nhigher on this. But really uh T5 is an\nkind of astounding exception at 64. I\ndon't think any other model has really\ngone that high in the feed forward\nmultiplier.\nand empirically, if you look at other\nsort of works that try to do more\ncontrolled comparisons of this ratio, um\nI've taken this one from Kaplan in 2020.\nThis is the classic uh neural scaling\nlaws paper um where they they do sort of\nvarious controlled uh studies on\nlanguage models. You'll see, you know,\nthis wasn't the point of the study,\nright? It was a scaling laws study. 
But\nyou'll see in one of the panels that\nthey actually have a sort of ablation or\nsweep where they change the feed forward\nratio and they look at the loss, right?\nUm for a very small model here, right?\nBut what they what they find in this\npaper is there's a basin where you start\nat about one and you end up about maybe\n10 where this hyperparameter is like\npretty good and very very flat. You lose\nvery little relative to the optimal loss\ndown here, right? Um and then if you get\nit really wrong, like you get, you know,\nabove 10 to 100 or something like that,\nyou know, then your loss starts really\nshooting up quadratically.\nUm and so a lot of these choices that\nrange between like 2.6 to four, they're\nall kind of falling into this relatively\nnice basin. So you're fine choosing\nthose numbers, right?\nOkay. So what can we learn about this\nhyperparameter? Well, the default\nchoices have worked very well for nearly\nall modern language models. So you can\nsafely choose that. Um T5 was a fine\nmodel or the version 1 T5 was a fine\nmodel, right? Like it wasn't a bad\nmodel. Um and so even radical choices\ncan technically work, but it's probably\ngoing to be compute inefficient. And I\nthink the funniest part of the saga of\nthis kind of the punchline of the T5\nsaga to me is that they have a follow-up\nmodel T5 v1.1 um that's like\nsupposed to be the improved version of\nT5 and they kind of go back to the\nstandard 2.5 multiplier, you know? So\nthere's nothing explicitly stated here,\nbut clearly, you know, when they tried\nto to update T5, they decided that they\nwanted to go back to a more standard\nmultiplier, which I find to be a little\nbit funny.\nOkay. So, that's the, you know,\nfeed-forward ratio,\num which now you have like a rough sense\nof like what the right order of\nmagnitude is.\nNow, let's talk about a different\nconsensus hyperparameter. 
Um I always\nfound this to be very strange when sort\nof teaching uh 224N and, you know, just\nsort of teaching students about this,\nwhich is, if you have a multi-head\nattention, where you have multiple heads\nfor your attention in your transformer,\num the canonical thing to do, the thing\nthat almost everyone does, is if you\nhave multiple heads, you make sure that\nthe size of those heads, the head\ndimension, is such that you sort of have\nthe same dimension as a single-head\ntransformer, right? So, you always make\nsure that you sort of divide the hidden\ndimension to basically multiply with H.\nSo, in this case, right, you have H, the\nnumber of heads, and the dimension of\neach head is D over H, so you multiply\nthe two and you get D, right? For some\nreason, this is kind of the rule of\nthumb.\nUm of course, this doesn't have to be\ntrue. We can arbitrarily change the the\nratios between head dimensions and model\ndimensions, but most models do follow\nthis guideline, and it turns out to work\npretty well. Um\nYou know, we can look at a variety of\ndifferent models, classic and new. I,\nyou know, have the latest and greatest\nquad as well, and you kind of find,\nyeah, the ratios are roughly around one\na model head. Um you know, notable\nexception of T5, um and even LaMDA,\nwhich is another Google model, um but\nreally everyone sticks around one. And I\nthink this is\nkind of an interesting one.\nUm I think the thing about head\ndimensions uh that I'll that I'll end\nwith here is I think this is yet another\nkind of forgiving hyperparameter. Um\nthere's a couple of ablations that\npeople have done. There's once again a\npretty wide basin around one that you\ncan sort of get away with.\nOkay, but that one's maybe not the most\ncritical uh hyperparameter.\nI think maybe one of the most critical\nand interesting ones, I think\nconceptually, is this idea of an aspect\nratio, right? 
Um and then sort of to add\nan extra point here,\num when you scale models up or down, the\nway you usually do that is you fix an\naspect ratio, like how wide your model\nis versus how deep it is, and then you\nmake the whole model bigger, right? So,\nthe aspect ratio in some sense controls\nthe entire depth-to-width tradeoff as\nyou make models bigger, right?\nNow, you might wonder how deep should my\nmodel be. Like, if you've been following\nall this stuff on like reasoning and so\non, you might think I need a really deep\nmodel or really shallow model if I want\nsystems utilization. You might think\nthat there's a lot of sort of variation.\nAnd there is a lot of variation, um much\nmore so than other hyperparameters, but\nthere's actually like a fairly clear\nsweet spot that most modern models fall\ninto.\nUm you don't really see models go like\ntoo uh too deep, um and you also don't\nsee models go too wide uh in either\ndirection, right? You see most models\nhave a ratio about a hundred um D model\nover N layers. Um so, about hundred sort\nof width for every layer that you have.\nI mean, this is true for like GPT-3 or\nLLaMA or any one of these models.\nand really, I think the considerations\nare partly a tradeoff between\nexpressiveness and hardware, right? If\nyou have an extremely extremely deep\nmodel, um they get very very annoying to\ndeal with systems-wise. The deeper your\nmodel, like, what is the ways that you\nhave for parallelizing them? Well, you\nmight have to cut up your layers. If you\ncut up your layers,\nwe'll talk about this in the systems\nlecture. Once you start cutting up your\nlayers um depth-wise, you have very\nserious issues in parallelization.\nPipeline parallel, which is what this is\ncalled, is something that like most\npeople really really do not want to deal\nwith. Whereas width is much easier to\nparallelize. If you have a really wide\nmodel, you know, you can cut that up\nvery easily in your GPUs. 
Uh tensor\nparallel is what it's called is much\nmuch simpler to deal with.\nand so, in some sense, you know, there's\nsystems reasons to go wide, and maybe\nthere's expressiveness reasons to go\ndeep, and you end up at roughly a\nhundred. Um and I think one of the\nreally interesting things about um\ntransformer hyperparameters is there are\na lot of hyperparameters that seem quite\nimportant, but they're also fairly\nforgiving, and people have converged\nroughly on the minimum. This is yet\nanother plot from Kaplan et al., um\nwhich shows another sweep over\nhyperparameters for differently sized\nmodels.\nUm and once again, you see, regardless\nof kind of the size of your model,\nroughly speaking, the optimum aspect\nratio is fairly similar, and they live\nat about a hundred, maybe a little bit\nless depending on how you want to do the\naccounting, but really, you know,\nanywhere near a hundred is a pretty safe\nbet for aspect ratios.\nUm Yi Tay and others uh did a number of\nreally interesting sort of like\narchitecture um\narchitecture variation experiments, in\nwhich their general conclusion on this\nwas that let's look at the top panel\nhere. You have a lot of different kinds\nof\nuh models that you can have in terms of\ndepth-to-width tradeoffs, um but as you\nsort of sweep the depth-to-width\ntradeoffs, you find that really, um the\nonly thing that matters in some sense is\nFLOPs. As you increase the FLOPs, the\nmodels get better, and that's really\ncontrolling the majority of the effects,\nnot necessarily uh the aspect ratio. And\nso, I think what has really emerged from\nthis is the sense that there's a general\nforgiving band of hyperparameters that\npeople tend to choose, and then you\nreally worry about primarily your\nsystems utilization rather than sort of\nexpressiveness concerns, which are hard\nto reason about.\nUm okay. And then maybe the last\nhyperparameter thing uh that I want to\nmention is vocabulary sizes. 
Um\nand this one's kind of interesting to me\nbecause there's a really clear\ndifference between two classes of\nmodels. Um I think in the early days of\na lot of, you know, um\nuh early days of open-source model\ntraining, um\nthere were a lot of monolingual models\nwhose only goal was to be good on\nEnglish. And for those models, you had\nthese like much smaller vocab size, in\nthe 30,000 range. Um and then,\npost-LLaMA, a lot of people were really\ninterested in multilingual or like\nproduction systems. So, these include\nclosed-source models like GPT-4. Um all\nthese have much much larger vocab sizes,\nand these are roughly in the hundred to\n200,000\num vocab range.\nAnd you see generally that, you know,\nGoogle models have a ton more vocab. Um\nLLaMA derivatives roughly range at about\na hundred uh thousand tokens, and then\nthe the sort of monolingual models are\nabout 30,000.\nUm this is somewhat clear. The\nmultilingual models really do need much\nlarger vocab to cover the whole space.\nGenerally, the models on the right are\nalso bigger. There have been scaling law\nstudies showing that the bigger your\nmodel, the larger the vocab it can\nhandle, and so this is also partially\ndriven by uh modern scaling trends,\nwhere the models on the right are\ngenerally bigger. No one's training\nsmall uh sorry, large monolingual models\nuh anymore.\nOkay. So, um yeah.\nSorry, uh the question was like, if you\nhave Sorry, multilingual models or\nsorry? Multimodal. Multimodal. Yeah, so\nI guess it depends on the way that your\ntokens are encoded, but, you know, if\nyou're tokenizing your images and things\nlike that, then you need to, you know,\nhave many more tokens to account for\nthose. 
Um if you look at like various\nopen-source releases, they'll have like\na different image tokenizer with its own\nvocab, which is quite large.\nUh how valid is it to\ncompare\nbits uh bits per byte for different for\ndifferent tokenizers?\nhow valid is it to compare bits Oh, that\nis a great question. Okay, yeah. Uh\nthat's not a hyperparameter question,\nbut that is a good question. Um so,\nwhat is the right way? Okay, so so, let\nme let me like\nstep back a moment and like put us in\nthe right mindset. So, if we think about\nlanguage modeling, language modeling is\nis a generative modeling task, right? We\nare modeling the probability of a\nsequence.\nNow, as long as your sequence is fixed,\nright? It's the same. You haven't, you know,\nadulterated it in any way, and you provide a\nprobability over all strings, that's\nalways valid to compare, right? At that\nlevel of things, it's always valid.\nNow, when you ask the question, is it\nvalid to compare the bits per byte of uh\narbitrary token or or two arbitrary\ntokenizers?\nReally, there's two things at play. The\none thing is, you know, did you touch\nthe sequence at all? Like, if you look\nat some tokenizers in the past, before\nsubword tokenizers, they would drop some\ntokens or drop some words. That changes\nthat makes the comparisons invalid. But\nmodern tokenizers are complete. They can\nmodel any sequence, so that's not a\nconcern. Um the other thing that you\nhave to worry about is, are we like\nlength normalizing it in any way, right?\nBut for bits per byte, you're always\nnormalizing with the same number, which\nis the number of bytes, and so this is\nalways a valid comparison, right? So,\nthat's kind of how to think about, you\nknow, tokenizer comparisons. 
So, for\nexample,\nuh I think they had the results showing\nthat\ncomparing perplexity for fixed\ntokenizers\nis is is is um\nalways leads to to better actual\nperformance.\nOn on downstream network tasks.\nIs the same thing they were looking for?\nUm perplexity and BPB are kind of dual\nto each other, so yes, if that's what\nyou're asking.\nIt's only yes and only no, cuz if you're\ncomparing\nyou could two frame compare the\nperplexity as compared\nbut you're changing it different splits.\nChanging it different [snorts]\nOkay, we'll have to we'll have to talk\nlater cuz I'm not sure I understand the\nquestion, but I think that that's an\ninteresting set of questions. Okay,\ngood.\nAll right.\nSo, um you know, we're we're going\nthrough really the the the low-level\nlowest levels of details of language\nmodeling, which I think has really\nexposed a lot of interesting ideas while\nwe sort of talk through this. And I\nthink dropout and\nregularization, I think, is another very\ninteresting class of ideas. Also one\nthat I think is very counterintuitive\nfrom your machine learning 101\nintuition.\nSo, let's uh\ngo through what I think is like the the\nstandard argument for, you know,\nregularization. Um well, if I'm doing\nlanguage modeling, I have a lot of data,\nright? I have more data than I can\nprocess most of the time, right? Unless\nyou're at, you know, Google, maybe even\nthen, there is more internet data than\nthere is flops. So, you know, I'm\nprobably not even going to see the same\ndata twice, right? 
Um so, I'm only going\nto do a single pass on a corpus, and\nthere's very good reasons and arguments\nto believe that a single pass of SGD or\nother optimizers is never really going\nto memorize my data very much, right?\nSo, this means overfitting is not really\na problem uh almost ever during compute\nconstrained language modeling.\nNow, you know, some people even actually\nonly look at training loss because they\nbelieve so strongly that overfitting\ndoesn't happen in single pass SGD.\nNow, given this, you know, you can sort\nof sit and think about this. Should I\nuse dropout or weight decay in language\nmodel training, right?\nOkay, you can think about it a bit.\nyou know, one unfortunate thing is that\na lot of recent models don't talk about\nthis stuff at all. Um it's really\nlower-level details than like tech\nreports are willing to expose.\nUm but if you look, actually you find a\nlot of models um do both. Especially\nweight decay actually is a fairly\npopular intervention even for modern\nhigh-performance language models. Um\nthis is very, very surprising, you know?\nI mean, some of the dropout things um\nyou know, maybe\nuh have gone out of favor, but weight\ndecay actually remains fairly popular.\nAnd this is very mystifying. Like, why\nis this?\nUm and this is, you know, one of the\nreasons why I think deep learning is\nhard and this architecture lecture is\nvery strange and hard. Um it's because\nthese things interact in very strange\nways. So, there have been papers that\nhave argued um and shown nice evidence\nthat weight decay is actually not a\nregularizer sometimes. It actually\ninteracts with the optimizer to\nessentially um\nmake optimization better. Um so, if you\nlook at the training versus validation\nloss across different weight decay\nsettings on, you know, language model\ntraining for single pass SGD, you don't\nreally see any difference. Like, weight\ndecay isn't shifting things so the\nvalidation loss is better. 
There's\nalready no overfitting. We're on the x\nequals y line here, right? So, doesn't\ncontrol overfitting,\nbut if we kind of look at different\nlevels of weight decay, and not only\njust different levels of weight decay,\nwe look at weight decay combined with\nlearning rate decay, um what we find is\nthat the stronger weight decay runs,\nthese blue dash lines on the bottom, you\nknow, do significantly better because\nthey start out slow, but they\nessentially end up um converging to a\nmuch better minimum later. And this is,\nyou know, generally true when we decay\nlearning rate, not necessarily true when\nwe're in constant learning rate, which\nis maybe somewhat more of where your\nintuition is coming from.\nSo, you know, this is part of why it's\nvery difficult to reason sort of a\npriori or like from scratch, you know,\nthe behavior of all these different\nchoices and why, you know, I think Percy\nand I have designed this class so that\nyou interact with stuff because, you\nknow, you might come upon this thing\nthat where basically weight decay is\nactually an optimization intervention\nand not necessarily a regularization\nintervention, which is, you know, what\nyou would expect here, right? So, always\nkeep that in mind that these kinds of\nunexpected effects can really start to\nkick in\nuh for these kinds of uh settings.\nAll right. So, to put everything\ntogether for hyperparameters, there's\nactually for, you know, a lot of the the\nmaybe more hairy-looking\nhyperparameters, actually just fairly\nstandard choices that have worked well\nfor everybody, right? You know, factor\nof four rule of thumb, keep your head\ndim and your number of heads uh equal to\nthe model dimension, um pick an aspect\nratio roughly around 100, um\nand, you know, if you ask about\nregularization, right? You want to maybe\ntry a couple things cuz regularization\nactually does interact with optimizers\nin ways that are quite counterintuitive,\nright? 
So, this is the thing that some\npeople uh still do even though you you\ndon't need the regularization at all.\nActually, maybe I'll stop here in case\nyeah.\nAre there any significant differences\nmaybe for like um\nthe diffusion models?\nOoh, diffusions.\nThat I have not looked into enough, to\nbe honest. Um there aren't that many\npeople training big diffusions is one\nissue. Um and many of the models that\nhave been trained are retrofitted cuz I\nthink the architectures are actually the\nsame as the, you know, like a LLaMA-like\nmodel. Um but if you're asking the\nquestion of like, what's the optimal\narchitecture if you were to train from\nscratch, I don't know what that is\nactually off the top of my head.\nYeah. Do you have any explanation for\nwhy regularization works in some cases?\nWell, I guess it's not that\nregularization in general affects\noptimization. I don't think people do\ndropout anymore because, you know, it\ndoesn't really uh interact well with\noptimization. But for example, weight\ndecay, you know, which is shrinkage to\nzero, um that might allow you to use a\nhigher learning rate or it might allow\nyou to decay faster. There are lots of\nways in which all these terms are\ninterrelated.\nNow, I've talked a lot about how to\ndesign um expressive models by sort of\nlooking at all these other models that\nhave been trained.\nUm one of the things that I'll I'll\nhighlight now is over the last few years\num a really big emphasis has not been on\nperformance alone. It has actually been\non stability. And this becomes an\nincreasingly important concern as your\nmodels get more and more expensive to\ntrain, right? Um we've kind of seen that\na lot of these choices are forgiving,\nright? Everyone's kind of doing similar\nstuff. And so, you know, you can mess\nwith these, but you're not going to get\na big performance difference. That's\nfine. 
But if your model, you know,\nsuddenly blows up some part into\ntraining, like you get these like\nhorrible-looking spikes all over the\nplace, um you know, you might end up\nwith a model that is, you know, actually\nnot very good quality, right? Or it\nmight be unrecoverable. You might have\nspent, you know, millions of dollars in\ntraining, and, you know, you get to a\npoint where the model is no longer able\nto be trained any further, right? That\nwould be a horrible thing to happen if\nyou have a lot of compute that you want\nto spend.\nSo, you don't want to train models that\nlook kind of like this blue curve with\nlike spikes everywhere and these, you\nknow, big gradient norms happening. Um\nso, what do we do to fix these stability\nissues? I mean, this is really, I would\nsay, like a core core issue.\nAnd, you know, if you have stability\nissues in language models or in general\nneural networks, there's a few, you\nknow, usual suspects that you've got to\nstart looking at.\nUm one of them is the soft maxes, and\nthe soft max has two things that are\nboth really bad for stability. One of\nthem is an exponential, right? We can\nsee how that blows up very quickly. Um\nyou also divide two numbers, and that's\nalso a potentially very dangerous\noperation, right? So, a soft max is one\nplace where you got to be extra, extra\ncareful.\nAnd where are the soft maxes in a\nlanguage model? Well, there's two of\nthem. There's one on the output side\nwhen we output our probability\ndistribution, and then in attention when\nwe normalize the attention, there's\ngoing to be another soft max, right? So,\nwe can think of both of those as really\nkind of danger zones for our model, um\nespecially the attention.\nBut okay.\nLet's start with thinking about the\noutput soft max. The output soft max can\nblow up on us. 
Um\nand one of the things that we can do is\nwe can try to control\num sort of the the normalizer problem.\nSo, you know, let's sort of think about\nthe soft max calculation. We want to\ncompute a log probability to compute the\nloss. Now, what is a log probability?\nWell, it's, you know, the output of your\nmodel U, and then you've got this log\nnormalizer, right? This U is\nwell-behaved because in some sense this\nis the output of your model, right? This\nis just the output of your residual\nstream with all the things that are\nadded in. So, if U is well-behaved, then\nlog P, the first term, is well-behaved,\nright? If the model is being okay.\nNow, the second term, this log Z, this\nmight not be so okay, right? If Z is\nreally big or really small, even if the\noutput of your model is somewhat\nwell-behaved, it could blow up. And what\nis Z? Well, it's an exponential, right?\nSo, it could potentially blow up very\nquickly on you.\nOr if this is zero, it could also blow\nup on you, right? So, both of those\ndirections are very, very bad.\nNow, we would ideally like our Z to be\nsomewhere near one, right?\nUm or log Z to be somewhere near zero.\nUm what can we do? Well, one of the\nthings that you notice, right? If you\nsort of thought about the action of the\nsoft max, is this whole thing is\noverparameterized, right? Um I could\nsort of push things in and out. So, if I\nadd a constant to U, I can manipulate\nthe Zs without really affecting the\noutput of the soft max, right? You can\ncancel out between the normalizer and\nsort of the output of my model.\nSo, because of that property, one thing\nthat I could do is I could add a\nregularizer. Um this is from from Jacob\nDevlin's paper 2024, uh sorry, 2014, um\nin which he adds sort of this squared\nlog Z term. Um\nand what this is doing is it's just\npenalizing how far away your log Z is\nfrom zero. 
And if log Z is near zero,\nthat's nice because this whole\nexpression is kind of numerically\nstable.\nThis is called the Z loss trick. Um it's\nbeen used by a number of papers. Um\nJacob Devlin and others uh sort of\npopular or initially pioneered this back\nin 2014, and then it's become popular\nagain through a number of open-source\nmodels. Like, Baichuan I think was the\nfirst open-source model to do it, but\nthen DCLM and OLMo and others have been\nusing this trick to stabilize their\noutput soft maxes.\nSo, this is this is a surprisingly\neffective thing.\nNow, okay. So, let's say we've handled\nthe instability issues on the output\nsoft max.\nNow, we have to sort of turn our\nattention towards the other potential\nproblem, which is attention, right? And\nthis is a a place where lots of\ndegeneracies happen. Lots of techniques\nhave been developed to control the\ninstability that attention operations\ngenerate.\nUm and really the, you know, the\nhigh-level thing that I'll say\nis\nif you have instability, if you can\nthrow a layer norm in there somehow, it\nmight control it. And that's really in\nsome sense the the design philosophy\nbehind this idea called the QK norm. Um\nso, what you do is remember that we\nhave, you know, our Qs and Ks um that\nare going to be multiplied together, and\nthen they're going to go into the soft\nmax, right? So, in the standard\nattention operation, I'm going to layer\nnorm as a pre-layer norm, multiply with\na QKV, and then I'm going to get my Qs\nand Ks. Those will get multiplied by a\nmatrix multiply, I'll soft max them, and\nI'll multiply that with V to get the\nweighted average, and then I'll output\nwhatever comes after, right? So, this is\nour usual attention. Now, what happens\nif we just throw in a layer norm before\nwe multiply the Qs and Ks? If we do\nthat, then we know that the inputs to\nthis matrix multiply, and therefore the\ninputs to the soft max, roughly have the\nsame scale. 
They're always going to have\na scale of roughly one because we've,\nyou know, used RMS norm to divide the\nsize of those Qs and Ks.\nOkay. If we do that, then, you know,\nwe're kind of going to keep this soft\nmax operation stable. Tons of different\nmodels do this. It's originally from\nthe multimodal world.\nYou know, some folks who were doing\nmaking multimodal models sort of\ninitially discovered QK norm.\nIdefics and Chameleon really, you know,\nused this and like proved it out. And\nthen a number of other open-source\nlanguage models, you know, realized that\nthe same tricks are entirely applicable\nto\nstabilizing attention for language\nmodels, and I think this is now very,\nvery standard. Like QK norm is actually\na very standard intervention that most\nof the large models now introduce. Um it\ndoesn't seem to affect performance um\nfrom lots of different training runs,\nbut it does definitely prevent the kinds\nof um attention degeneracies. Um and,\nyou know, I I'm really\n>> [laughter]\n>> the the way that I've seen this is, you\nknow, we have layer norms initially in\nthe pre-norm. Now, we add them after the\nthe\nnonlinearities in each block, and now\nwe're throwing them in both the Qs and\nthe Ks. And really, I think this is is\nis kind of getting at\nthe stabilization tricks that people\napply to this world. Okay.\nNow,\num the final set of things that I'll\ntalk about as a stability intervention.\nAnd frankly, this one is not as popular\nand more of a of a Google-specific trick\nthat I've seen. Um but uh logit soft\ncapping is a much harder intervention\nthat some people apply. So, this one, um\nyou know, in QK norm, what we're doing\nis we are controlling the inputs to the\nsoft max and sort of hoping that the\noutputs are well-behaved. 
If we really,\nreally want to enforce\num well-behaved outputs, what we can do\nis we can kind of take the logits, the\nthings that go straight into the soft\nmax, and we can just cap them off so\nthey can never be too large or too\nsmall, right? This is a hard almost a\nhard constraint. Um it's called a soft\ncap, of course, but a tan H, you know,\nis bounded at some value.\nUm and so uh this is in the Gemma\nmodels. Um I think both Gemma or like\nGemma's two, three, and four all use the\nlogit soft cap trick. Um and what they\ndo is they take all of their logits from\nthe attention layers, and then they soft\ncap them at some value.\nUm some Nvidia folks have done actually\nquite nice work doing systematic\ncomparisons of these stability\ninterventions. Um and what they find is\num if you start with a baseline model,\nyou can do all sorts of different\ninterventions, and QK norm is here, and\nit does slightly better due to the fact\nthat you can crank up the learning rate\na little bit. Um but if you do soft\ncapping alone, you actually end up\nlosing performance. So, there is a a\nquality degradation that happens. This\nis a very strong intervention. You can\nnever express very confident uh signals\nin your soft max beyond a certain point.\nUm so, it does have some negative\nconsequences, but this is a very safe\nway of stabilizing the outputs of your\nattention. Or sorry, the the inputs to\nyour attention, the logits that go into\nthe soft max.\nSo, that's kind of the end of the\nstability components. Um I can pause for\na moment here. Um and I'll talk about\nsort of various attention things um\nafter that.\nAll right. So,\nthe last thing I want to talk about\ntoday is various interventions that you\ncan make to your attention head.\nUm and as I was saying at the beginning\nof this lecture,\num I'm only going to talk about all the\nthings that you can do to sort of dense\nall by all attention today. 
So, if you\nif you were interested in hearing about\nstate space models or linear time\nattention, um sadly today is not the day\nfor you.\nUm the things that I do want to talk\nabout, which are really commonly\nimplemented um attention interventions\ntoday, are uh group query attention,\nwhich really saves inference cost by\nreducing the number of heads, um and\nsparse or sliding window attention, um\nwhich really originally came from the\nGPT-3-ish\nfamily, but have now really been adopted\nwidely by most models that are looking\nto do uh long context unless they're\ndoing exotic uh SSM stuff.\nSo, I'll start with um group query\nattention or GQA or MQA. Um\nthis I'm going to first set up the need\nfor these kinds of things, and then\nyou'll kind of hopefully see what the\nwhat the trick is um and why it's fairly\nnatural.\nSo, for the moment, we've been talking\nabout, you know, training and modeling\nand all these things, but like let's\ntake a pause, and now let's think about\ndeployment, right? You train this very\nbig model, and now you need to serve it\nto lots of users, and you're going to\npay a cost for serving.\nAnd you're going to have to, in abstract\nsense, pay for two different resources,\nright? You're going to have to pay for\nyour flops, right? The computation that\nyou're performing, but you also have to\npay for another thing. You have to pay\nfor your memory accesses, right? Because\nthe memory accesses are also going to\nimpact, you know, your system's\ncharacteristics, your latency, your\nutilization, right? So, you want both of\nthese things to be small.\nNow, let's think about what happens\nduring training or alternatively prefill\nwhen you're looking at your prompt where\nsomeone gives you the stuff. In this\ncase, you know, the total arithmetic\noperations you have is, you know, order\nof magnitude batch size sequence length\nhidden dim squared, right? 
That's\nroughly the size of things that you get.\nAnd of course, you know, we're doing\nquadratic attention, so we've got D\nsquared.\nwe've got uh\ntotal memory accesses. Like what is our\nmemory access that we have here? Um we\nhave batch times sequence length times\nuh hidden dim plus um\nthe sort of cost of the soft max, which\nhas a N squared component, and then\nwe've got a D squared component um for\nthe for the projections. So, the\narithmetic intensity here is pretty\ngood. Um\nit's going to be one over K. This is the\nnumber of heads, so you need to have um\nuh sorry, head dims. So, your head dims\nneed to be big enough that you're\nmultiplying some reasonably sized\nmatrices. And you've got a one over BN,\nso your sequences need to be long enough\nor your batch sizes need to be big\nenough. As long as both of these are\ntrue, your GPUs are going to be fully\nutilized. Great, right? You're You're\nusing all of your resources.\nNow, you know, we have done we have\nfinished training, and now we're serving\nour users. How do we serve our users?\nWe're going to generate tokens and send\nit to them, right? Now, for doing that,\num I can't parallelize the generation\nprocess. What I'm going to do is I'm\ngoing to generate a token, I'm going to\ncondition on it, I'm going to generate\nthe next token, and I'm going to repeat\nthis process one by one, right? This is\njust sort of the the curse of\nautoregressive language modeling. We\nhave to do this.\nUm in order to do this, the efficient\nway to do it is to maintain all of the\nsort of past keys and queries that I've\nhad in what's called a KV cache\nover the past, and then whenever I need\nto compute something new, I can reuse\nsort of the the submatrices that I've\nalready had from the past. And I only\nreally need to compute sort of the new\num query key interactions that I need to\nfill out the rest of this matrix, right?\nSo, every submatrix I've computed\nbefore, I can keep. I only need to\ncompute my new ones. 
So, this saves a\nlot on compute, right? That's great.\nBut,\nthe issue here is now our arithmetic\nintensity is not so good, right? As you\nmight sort of intuit, this KV cache\napproach is going to be reading and and\nreading um parameters all the time,\nright? Each time I have a new step,\nright? I'm going to have to read in my\nparameters. I'm going to have to take\nthese dot products, and I'm going to do\nthis once every step.\nAnd so, now what do I have? Well, you\nknow, my total memory Oh, sorry. My\ntotal arithmetic operations are the\nsame. I'm multiplying the same matrices\nstill, right? Just incrementally rather\nthan all at once. But because I'm doing\nthis incrementally, you know, now I have\num a a memory access pattern of batch by\nsequence squared by hidden dim plus um\nsequence by hidden dim squared. And the\nsecond term is not so pleasant, right?\nIt used to be that it was just D\nsquared, but now we've got N times D\nsquared. And if we compute the\narithmetic intensity, which is the ratio\nof these two guys, um now we have N over\nD plus one over B. So, now what we need\nis large batches plus short sequence\nlength, or we need really big model\ndimensions. So, if we want to serve a\nsmall model efficiently, this is not so\ngood.\nRight? Um this is really difficult to\ndeal with, right? This N over D term,\nthis first term over here, which is\nsequence length over hidden dim, is very\ndifficult to reduce if we're doing this\nincremental computation. This is just a\nhard thing to deal with.\nSo, this leads to this idea of MQA or\nmulti-query attention. Normally, you\nhave multiple heads in your attention\noperation, and you're going to have\ndifferent keys, different values, and\ndifferent queries. 
That's normally how\nyou do things.\nBut, one thing that we could do is maybe\nwe can keep the keys and the Vs the same\nacross all the heads, and the only thing\nthat's different across the heads are\nthe queries. If we do this, then this\ndrastically removes the amount of items\nthat need to be moved in and out of\nmemory, right? Because the KV cache,\nright? Are now significantly smaller.\nThese are all shared across all the\nheads. Um this significantly reduces the\ntotal memory access as well as the\narithmetic intensity, and we're kind of\nthe key term that we were talking about\nhere, we had the N over D term. Now, we\nhave uh H multiplying this, right? And\nso, this H term allows us to\nsignificantly reduce the the I sorry,\nincrease the arithmetic intensity if we\nhave a lot of heads, right? This is a\nsignificant gain over what we had\nbefore.\nSo, this gets us significant efficiency\nimprovements,\nbut the issue with MQA is\nthis is on the right here, you have one\nvalue and one key for all these queries.\nYou do in fact lose significant\nexpressive power if you do this.\nAnd so, there's this trade-off between\nsystem's efficiency and expressiveness,\nand you might wonder, is there sort of a\nsweet spot in which we can avoid trading\noff, you know, quite a significantly\nexpressive power and computation. And\nthat's where GQA or grouped query\nattention comes in. You know, the\noriginal transformer is multi-head. We\nhave queries and keys for each head. In\nmulti-query, we have one key and value\nfor each for all the heads. In grouped\nquery, we reduce the amount of keys and\nvalues, but we keep the number of\nqueries the same. So, we now have this\nratio that we can play with, which is\nkind of the number of key heads or the\nnumber of value heads\nwhile keeping the total number of heads\nmuch larger than that. 
So, this allows\nus to very simply control the\nthe trade-off between expressiveness and\ninference efficiency.\nUm there are other sort of tricks from\nDeepSeek-V2, multi-head latent\nattention, that I'll sort of mention\nbriefly next time,\nwhich sort of have a different kind of\nfactorization structure and a different\nset of trade-offs. But really,\nthe nice thing about GQA is that in\npractice the trade-off is quite\nfavorable. So, if you have multi-head,\nyour performance, this was you know, in\nthe I think T5 days if I remember right.\nThis is your downstream model\nperformance. This is your time per\nsample. You want to reduce this as much\nas possible.\nWith multi-head attention, you have best\nperformance but very high cost.\nWith MQA, you know, you have\nlower cost but much lower performance.\nSimilarly, if you make your model\nsmaller to try to hit your performance\ntargets, you get much worse performance.\nGQA really does get the best of both\nworlds, you know, very low inference\ncost, nearly the same performance as\nyour full multi-head.\nUm and you see sort of this like GQA\ngroup structure where if you have\na small reduction in the number of\nheads,\nyou basically have most of the gains in\nyour performance, which allows you to\nsort of keep most of the expressive\npower\nwhile getting significant inference\nimprovements. And Percy will talk a\nbunch more about sort of the inference\nmechanics later,\nbut sort of this should give you a\nflavor of like why models today almost\nall adopt this GQA structure because it\ngives you a lot of this inference cost,\nwhich is really critical, without very\nmuch of a\nexpressiveness hit.\nCool. Any questions for for GQA or KV\ncache? Yeah.\nGiven\nthat you have so many\nlike rules of thumb for what\nhyper-parameters are good, like to what\nextent are you still searching over\nhyper-parameters versus exploiting these\nrules of thumb that you\nI think it's a mix of both. 
I think\nevery sort of model training run has\nsome theses about what can be varied.\nAnd so, you see this in a lot of the\nreports where I think the\nhyper-parameters are often not where\npeople are touching too much.\nBut you see like architecture changes\nlike one at a time in a lot of these\nreports. But it's very rare to like go\nand change everything up. I think Google\nis one of the only orgs that seems to\nlike really spice things up in a\nsignificant way.\nThe Gemma series has done some pretty\ninteresting things.\nThe most recent Gemma 4 release, for\nexample, now has like individual\nembedding for every layer\nin a way to control the trade-offs\nbetween like memory use and flops.\nThey're very interesting set of things\nthat they've done.\nOh, yeah, back there.\nDo you experiment with like data all\nthings of these parameters\nduring training?\nDuring training,\nlet me think.\nWeight decay, yes. Weight decay, people\nchange in concert with like learning\nrate.\nThat is actually a heuristic that people\ndo that works very well.\nUm other than that, I don't know if\nthere's a lot of different hypers that\npeople change during training,\nespecially because the architecture ones\njust make training incompatible. So, you\ncan't really, you know, change them\nwhile you're you're training.\nYeah, so I think I think weight decay is\nprobably the one that I can think of.\nThe others are usually fixed.\nYeah, MQA is uh\nit's not just inference time fixed. It's\na pleasure to train. That's right, yeah.\nYou you you train with a certain number\nof keys.\nOkay. The last thing I'll talk about is\nsliding window attention, which is a\nreally old idea. 
Like, you know, GPT-3\nused actually this you know, if you read\nthe paper, they'll say we alternate\nbetween full attention, which where\nevery position can attend to everyone in\nthe past, and a banded matrix style\nattention where you can attend to\neveryone within a fixed window.\nAnd you know, OpenAI has some early work\non these kinds of like different kinds\nof attention patterns that you can use.\nBut actually, this has become really\nreally popular over the past year.\nThis idea of alternating, you know, the\nbig full attention and a more local\nattention actually hits a sweet spot for\nhow to manage like long context\nperformance while not paying too much\nfor inference.\nUm I think, you know, the more recent\nrevival in open models, I would maybe\nsay Cohere Command A was the first one I\nsaw do it, where they, you know, had\nthis like structure where every four\nlayers they would have a full attention\nthat attended to everything. The three\nlayers in between would use a sliding\nwindow attention that would only be able\nto look at local structure. And of\ncourse, you know, as you go up sorry, in\nthis case down cuz they ordered the the\ndiagram the other way. As you go down\nsort of these blocks, you know, you're\naggregating local information into\nglobal ones. The local attentions at the\nend can of course access more global\ninformation, but this, you know, allows\nyou to manage the\nthe the cost of having a really long\ncontext without having to go for\nsomething like a state-space model or\nmore exotic intervention. And that's\nworked quite well.\nUm there's also some innovation where\npeople change the embedding format for\nthe long range information where they\nget rid of things like rope, so you have\nno position embeddings at all. So,\nyou're really looking almost at bags\nwhere the short range information still\ngets position information. 
So, people do\nall sorts of you know,\nkinds of interventions involving these\nthese\nboth the embeddings and alternating\nlocal and global structure.\nUm I'll say that this is a\nyou know, attention and in general how\nto manage the trade-off between long\ncontext and sort of long context cost\nand performance is a still an active\narea of investigation. It's a place\nwhere the most architecture work and\nchanges are still being done. Um we see\nessentially a bunch of other models\nadopt this idea. Llama 4, most recently\nGemma 4, Omo 3, they all do this\ncombination of sliding window attention\nand full attention, in their case using\nfull rope instead of nope as the\nembedding.\nSo, as I said, this is becoming really\nreally popular.\nQwen 3.5, which I put on the right,\nthey're actually a little bit different\nbecause they alternate a state-space\nmodel\ncalled a gated DeltaNet and a full\nattention every sort of, you know, one\none full attention every four layer\nevery four layers.\nSo, it's the same alternating structure,\nbut they're using a different sort of\ncheap layer. In their case, they're\nusing a state-space model. I'll explain\nwhat that is next lecture instead of a\nsliding window sort of local attention.\nBut you see this is like I think a new\ntheme over the past year where, you\nknow, open models are really trying to\ngrapple with long context performance,\nand the way to do that, at least so far,\nis to have these hybrid models that\naren't just global attention, aren't\njust cheap attention. They're some sort\nof mix in between. And that's that seems\nto have worked very well so far in a lot\nof these models.\nOkay, cool. So, as I was sort of trying\nto emphasize, when you look across all\nof these models, you start to see a lot\nof patterns and hopefully a sense of\ngeneral understanding about what things\nyou can do and what things are good to\nfolks. 
Um we also see a lot of\ndifferences in how we handle context and\nhow we handle position embeddings.\nEven tokenization, there's some\ndifferences, right? So, there are\ndifferences across these models, but\nacross these models, but\nthere's also commonalities that\nhopefully now give you some intuition as\nyou go out and do your assignments and\nand sort of mess with the leaderboard\nand so on.\non.\nThanks.", + "fetched_at": "2026-06-22T05:14:54Z", + "source": "yt-dlp-vtt", + "clean_segments": [ + { + "start": 5.64, + "text": "So today we're going to talk about" + }, + { + "start": 6.8, + "text": "architecture, which at least to me has" + }, + { + "start": 9.44, + "text": "always been pretty inscrutable." + }, + { + "start": 12.0, + "text": "Um and so I'm going to take the approach" + }, + { + "start": 13.64, + "text": "of just telling you kind of everything," + }, + { + "start": 16.04, + "text": "right? I'm going to go through all of" + }, + { + "start": 17.24, + "text": "the modern papers." + }, + { + "start": 19.32, + "text": "Um and we're going to just look through" + }, + { + "start": 20.92, + "text": "what has everyone done? Um and so I've" + }, + { + "start": 22.88, + "text": "titled this everything you didn't want" + }, + { + "start": 24.4, + "text": "to know about architectures and" + }, + { + "start": 26.12, + "text": "hyperparameters because I think we all" + }, + { + "start": 27.8, + "text": "wished we lived in a world where the" + }, + { + "start": 29.76, + "text": "only things you had to know were like VC" + }, + { + "start": 31.36, + "text": "dimension or something, right? Like very" + }, + { + "start": 32.72, + "text": "simple, you know, theoretical tools, but" + }, + { + "start": 34.72, + "text": "that's not really where we are." + }, + { + "start": 36.96, + "text": "So okay. 
What we are going to do is we" + }, + { + "start": 39.96, + "text": "are going to try to understand" + }, + { + "start": 41.48, + "text": "architecture from kind of like a survey" + }, + { + "start": 43.64, + "text": "lens, right? The best thing to do, you" + }, + { + "start": 46.36, + "text": "know, better than listening to this" + }, + { + "start": 47.4, + "text": "lecture even is for you to go out and" + }, + { + "start": 49.16, + "text": "like train your own models and try" + }, + { + "start": 50.56, + "text": "different architectures, right? That's" + }, + { + "start": 51.8, + "text": "by far the best thing to do. That's part" + }, + { + "start": 53.88, + "text": "of the philosophy of the course. But" + }, + { + "start": 56.24, + "text": "we're not going to be able to cover the" + }, + { + "start": 57.68, + "text": "whole design space of all the different" + }, + { + "start": 59.4, + "text": "architectures that are out there, right?" + }, + { + "start": 61.08, + "text": "Like that's not something that we have" + }, + { + "start": 62.2, + "text": "the compute or the time to do." + }, + { + "start": 64.32, + "text": "So my opinion is the second best thing" + }, + { + "start": 66.72, + "text": "that we could do is to try to learn from" + }, + { + "start": 68.68, + "text": "the experience of others, right? What" + }, + { + "start": 70.52, + "text": "has What has everyone else done? What" + }, + { + "start": 72.28, + "text": "are the choices that they are making," + }, + { + "start": 73.64, + "text": "right? 
And by looking at kind of a" + }, + { + "start": 75.28, + "text": "broader, somewhat zoomed out picture," + }, + { + "start": 77.36, + "text": "maybe we can start to understand, oh," + }, + { + "start": 79.08, + "text": "these are the kinds of parameters and" + }, + { + "start": 80.84, + "text": "choices that are sort of fixed across" + }, + { + "start": 82.92, + "text": "all effective architectures and these" + }, + { + "start": 84.68, + "text": "other ones can be sort of varied without" + }, + { + "start": 86.36, + "text": "impacting" + }, + { + "start": 87.72, + "text": "how the model performs, right? So I'm" + }, + { + "start": 89.12, + "text": "going to talk about, you know, basically" + }, + { + "start": 91.2, + "text": "transformer variants. Like what is the," + }, + { + "start": 93.44, + "text": "you know, modern transformer starting" + }, + { + "start": 95.08, + "text": "with, you know, the Vaswani paper and" + }, + { + "start": 97.52, + "text": "then, you know, as we go to more modern," + }, + { + "start": 100.28, + "text": "more recent architectures, what do they" + }, + { + "start": 101.92, + "text": "have in common? And then what are we" + }, + { + "start": 103.68, + "text": "allowed to vary? Or not allowed, but" + }, + { + "start": 105.12, + "text": "what do people vary as they go through" + }, + { + "start": 107.28, + "text": "this, right?" + }, + { + "start": 108.84, + "text": "So" + }, + { + "start": 109.84, + "text": "I think many of you have taken an NLP" + }, + { + "start": 111.6, + "text": "course of some kind or at least seen a" + }, + { + "start": 113.44, + "text": "transformer, so you've probably seen," + }, + { + "start": 115.56, + "text": "you know, the very vanilla transformer" + }, + { + "start": 117.32, + "text": "from Vaswani et al. Um you know, there" + }, + { + "start": 121.56, + "text": "there are some fairly standard choices" + }, + { + "start": 123.0, + "text": "that you make. 
You say, oh, transformers" + }, + { + "start": 124.8, + "text": "don't have positional dependence, so" + }, + { + "start": 126.44, + "text": "we're going to add a position embedding." + }, + { + "start": 128.16, + "text": "And what do we do? We're going to add" + }, + { + "start": 129.119, + "text": "some sines and cosines." + }, + { + "start": 131.08, + "text": "Um we're going to have information" + }, + { + "start": 132.76, + "text": "processing through a ReLU. Um and then" + }, + { + "start": 135.48, + "text": "we're going to have a a post norm. I'll" + }, + { + "start": 137.12, + "text": "talk about what exactly that is later." + }, + { + "start": 139.28, + "text": "Um and when you look at your assignment," + }, + { + "start": 140.88, + "text": "your A1, you're going to notice some" + }, + { + "start": 142.84, + "text": "differences between the standard or the" + }, + { + "start": 145.12, + "text": "vanilla transformer and what we've asked" + }, + { + "start": 147.28, + "text": "you to implement. Well, we're going to" + }, + { + "start": 148.48, + "text": "ask you to move the layer norm to the" + }, + { + "start": 150.32, + "text": "front of each transformer block or the" + }, + { + "start": 152.28, + "text": "non-residual layers. We're going to ask" + }, + { + "start": 154.24, + "text": "you to implement something called rope." + }, + { + "start": 156.04, + "text": "Um" + }, + { + "start": 156.64, + "text": "and we're going to ask you to implement" + }, + { + "start": 158.48, + "text": "something called SwiGLU and not ReLU." + }, + { + "start": 161.16, + "text": "Right? Why do we pick these? Um one" + }, + { + "start": 163.6, + "text": "reason is we've, you know, copied a lot" + }, + { + "start": 165.36, + "text": "of this over from LLaMA, but so did" + }, + { + "start": 167.2, + "text": "everyone else. 
Really, I think if you" + }, + { + "start": 169.48, + "text": "were to train on your own language" + }, + { + "start": 170.88, + "text": "model, I think you'll quickly run into" + }, + { + "start": 172.52, + "text": "this question of, oh, there's so many" + }, + { + "start": 173.76, + "text": "choices, right? Like what do I choose" + }, + { + "start": 175.64, + "text": "for all these things?" + }, + { + "start": 177.12, + "text": "And so let's now sort of walk through" + }, + { + "start": 179.52, + "text": "all these different models." + }, + { + "start": 181.52, + "text": "The way I kind of think about" + }, + { + "start": 182.52, + "text": "architectures is to think about to look" + }, + { + "start": 185.04, + "text": "at all the different things people have" + }, + { + "start": 186.32, + "text": "done and say, what are the things that" + }, + { + "start": 188.16, + "text": "people have done? Can we pick and choose" + }, + { + "start": 189.92, + "text": "from those?" + }, + { + "start": 191.16, + "text": "Um Percy always makes fun of me for this" + }, + { + "start": 193.24, + "text": "a little bit, but you know, I try to" + }, + { + "start": 194.88, + "text": "look at all the the different models" + }, + { + "start": 196.44, + "text": "that come out each year to try to make" + }, + { + "start": 198.32, + "text": "this lecture." + }, + { + "start": 199.56, + "text": "Um and last year I thought, oh, there's" + }, + { + "start": 201.0, + "text": "just a couple papers. It's going to be" + }, + { + "start": 202.4, + "text": "fine. It's going to be fine. And then I" + }, + { + "start": 203.76, + "text": "look through all the things and there's" + }, + { + "start": 204.88, + "text": "a lot of papers. There's Qwen 2 and" + }, + { + "start": 206.56, + "text": "Gemma 3" + }, + { + "start": 207.88, + "text": "and InternLM2." + }, + { + "start": 210.2, + "text": "and then there were even more. 
There's" + }, + { + "start": 211.2, + "text": "like NeMo Tron 4 and Qwen 2 and oh oh my" + }, + { + "start": 213.24, + "text": "goodness, there were 19 new dense" + }, + { + "start": 215.24, + "text": "models. And so last year I had my work" + }, + { + "start": 217.24, + "text": "cut out for me." + }, + { + "start": 218.88, + "text": "And then this year, you know, I thought," + }, + { + "start": 220.6, + "text": "well, there can't be that many new LM" + }, + { + "start": 222.44, + "text": "releases. Like it's got to be slowing" + }, + { + "start": 224.0, + "text": "down, right? Like people can't keep" + }, + { + "start": 225.52, + "text": "training 20 dense LMs per year. Um and" + }, + { + "start": 227.96, + "text": "that's technically right. There aren't" + }, + { + "start": 229.0, + "text": "that as many dense LMs. Initially, you" + }, + { + "start": 230.8, + "text": "know, I was like, oh, there's Qwen 3," + }, + { + "start": 232.6, + "text": "Gemma 4 just came out last Thursday, so" + }, + { + "start": 234.36, + "text": "I put that in there. And almost 3. You" + }, + { + "start": 236.16, + "text": "know, there's only a couple. And of" + }, + { + "start": 237.64, + "text": "course I have to give a shout-out to" + }, + { + "start": 239.28, + "text": "Percy's own 8B model trained with" + }, + { + "start": 241.4, + "text": "Marine. And I was like, oh, we'll just" + }, + { + "start": 242.52, + "text": "have a few things to cover. Um but it" + }, + { + "start": 244.4, + "text": "turns out if you start looking, there's" + }, + { + "start": 245.6, + "text": "a lot of different models. Um and so the" + }, + { + "start": 248.6, + "text": "fact that we have so many different" + }, + { + "start": 250.52, + "text": "models, most of these actually are MoEs," + }, + { + "start": 252.72, + "text": "mixtures of experts, and I'll be talking" + }, + { + "start": 254.28, + "text": "about that tomorrow rather than today." 
+ }, + { + "start": 256.16, + "text": "Um because we have such a big diversity" + }, + { + "start": 257.88, + "text": "of models, we actually get a pretty good" + }, + { + "start": 260.28, + "text": "picture of all the different choices" + }, + { + "start": 262.72, + "text": "that we can make. Um" + }, + { + "start": 264.12, + "text": "so I I made this like little table." + }, + { + "start": 265.68, + "text": "We'll come back to this little table at" + }, + { + "start": 267.16, + "text": "the end of the lecture." + }, + { + "start": 268.8, + "text": "Um but basically at this point, you" + }, + { + "start": 270.76, + "text": "know, starting with, you know, the" + }, + { + "start": 272.16, + "text": "original transformer, there's been" + }, + { + "start": 273.84, + "text": "actually quite a few autoregressive" + }, + { + "start": 276.32, + "text": "language models kind of trained on the" + }, + { + "start": 278.24, + "text": "same class of things." + }, + { + "start": 280.16, + "text": "Um and you can ask questions like, what" + }, + { + "start": 282.2, + "text": "are the different vocabulary sizes? Or" + }, + { + "start": 284.16, + "text": "what kind of layer norms do we use? Or," + }, + { + "start": 287.04, + "text": "you know, what kind of position" + }, + { + "start": 288.12, + "text": "embeddings do people use? And we see" + }, + { + "start": 289.88, + "text": "some fairly clear trends. I'll be" + }, + { + "start": 291.12, + "text": "talking about this as we go." + }, + { + "start": 293.28, + "text": "Okay." + }, + { + "start": 294.32, + "text": "Um so" + }, + { + "start": 296.88, + "text": "the goal here is that we're going to" + }, + { + "start": 298.4, + "text": "cover couple different things. We're" + }, + { + "start": 300.28, + "text": "going to cover common architecture" + }, + { + "start": 301.88, + "text": "variations. So these are different" + }, + { + "start": 303.4, + "text": "building blocks of the transformer." 
+ }, + { + "start": 305.68, + "text": "Um and after we've established what the" + }, + { + "start": 307.36, + "text": "standard building blocks are, like, you" + }, + { + "start": 309.28, + "text": "know, what do we use for the the" + }, + { + "start": 310.84, + "text": "nonlinearities or what do we use for" + }, + { + "start": 312.44, + "text": "position embeddings, then we're going to" + }, + { + "start": 314.4, + "text": "talk about hyperparameters. We're going" + }, + { + "start": 315.96, + "text": "to go down even lower detail and say" + }, + { + "start": 317.56, + "text": "like, you know, what is FF dim? Um" + }, + { + "start": 320.32, + "text": "should we make that a multiple of four" + }, + { + "start": 322.04, + "text": "or like multiply the the hidden by four" + }, + { + "start": 324.12, + "text": "to get FF dim? How many vocab elements" + }, + { + "start": 326.48, + "text": "should I have? Um" + }, + { + "start": 328.2, + "text": "and then after that, we're going to talk" + }, + { + "start": 329.76, + "text": "about very low-level tricks of how to" + }, + { + "start": 333.04, + "text": "get models to train stably. And the" + }, + { + "start": 334.92, + "text": "reason why I'm going to talk about that" + }, + { + "start": 336.24, + "text": "in this lecture is because these" + }, + { + "start": 338.0, + "text": "stability tricks have a pretty close" + }, + { + "start": 340.4, + "text": "connection with the architecture" + }, + { + "start": 341.8, + "text": "variation, right? Um one of the things" + }, + { + "start": 344.32, + "text": "that, you know, higher level I want to" + }, + { + "start": 345.76, + "text": "sort of impress upon you is that" + }, + { + "start": 347.52, + "text": "architectures are actually a a very" + }, + { + "start": 349.68, + "text": "complex set of tradeoffs, right? Like" + }, + { + "start": 351.8, + "text": "what does a architecture have to do?" 
+ }, + { + "start": 353.84, + "text": "Well, it has to learn from data, so it" + }, + { + "start": 355.44, + "text": "has to generalize. It has to train" + }, + { + "start": 357.28, + "text": "efficiently on GPUs. And it has to not" + }, + { + "start": 360.28, + "text": "blow up, right? Like halfway through" + }, + { + "start": 361.36, + "text": "training, if your, you know, training" + }, + { + "start": 363.84, + "text": "losses just go like down like this and" + }, + { + "start": 365.4, + "text": "then suddenly blow up, that's no good at" + }, + { + "start": 367.16, + "text": "all, right? So all these different" + }, + { + "start": 368.8, + "text": "requirements end up getting baked" + }, + { + "start": 370.88, + "text": "straight into the architecture. And" + }, + { + "start": 372.24, + "text": "that's why these things are a little bit" + }, + { + "start": 373.72, + "text": "messy and a little bit complex." + }, + { + "start": 375.96, + "text": "Um but you should keep that in mind and" + }, + { + "start": 377.56, + "text": "that's why, you know, things aren't in" + }, + { + "start": 378.84, + "text": "many ways not so elegant." + }, + { + "start": 382.28, + "text": "So we're going to start with the core" + }, + { + "start": 384.56, + "text": "architecture piece. And as a high-level" + }, + { + "start": 386.96, + "text": "view, you know, I think the the way that" + }, + { + "start": 389.44, + "text": "I see a lot of the architecture stuff," + }, + { + "start": 391.0, + "text": "you know, looking basically" + }, + { + "start": 392.16, + "text": "historically, is kind of in the early" + }, + { + "start": 394.84, + "text": "days of, you know, starting with the" + }, + { + "start": 396.12, + "text": "transformer until, you know, GPT-3 or" + }, + { + "start": 398.64, + "text": "so, there's a lot of experimentation" + }, + { + "start": 400.48, + "text": "that happens. People try lots of" + }, + { + "start": 401.8, + "text": "different things. 
There's no like gold" + }, + { + "start": 403.68, + "text": "standard that everyone has unified on." + }, + { + "start": 405.88, + "text": "And then, you know, LLaMA 2 comes out" + }, + { + "start": 407.8, + "text": "and everyone's like, wow, LLaMA 2 is" + }, + { + "start": 409.04, + "text": "great. I want my own LLaMA 2. And so" + }, + { + "start": 411.6, + "text": "everyone starts training LLaMA 2-alikes" + }, + { + "start": 413.72, + "text": "with, you know, minor variation" + }, + { + "start": 416.48, + "text": "that people have. And then finally, you" + }, + { + "start": 418.96, + "text": "know, last year we saw really big" + }, + { + "start": 420.96, + "text": "differences or or sort of a trend" + }, + { + "start": 423.0, + "text": "towards architecture modifications that" + }, + { + "start": 425.2, + "text": "make training more stable. And this year" + }, + { + "start": 427.8, + "text": "we see lots of trends towards" + }, + { + "start": 430.12, + "text": "architecture variations that enable" + }, + { + "start": 432.04, + "text": "longer context dependence. So there are" + }, + { + "start": 434.64, + "text": "these big themes that are happening, but" + }, + { + "start": 436.24, + "text": "really I think, you know, you see this" + }, + { + "start": 437.88, + "text": "like big point when LLaMA 2 comes out" + }, + { + "start": 439.84, + "text": "and everyone's like, wow, I want to" + }, + { + "start": 440.8, + "text": "train something with that. And then" + }, + { + "start": 442.0, + "text": "suddenly, or not suddenly, but after" + }, + { + "start": 443.84, + "text": "that, people are starting to explore" + }, + { + "start": 445.12, + "text": "once again. So it's kind of cool to see" + }, + { + "start": 447.12, + "text": "all these different changes." 
+ }, + { + "start": 450.56, + "text": "Um I think people can disagree about a" + }, + { + "start": 453.16, + "text": "lot of things on architectures, but" + }, + { + "start": 455.16, + "text": "there is one thing that everyone agrees" + }, + { + "start": 457.24, + "text": "on, you know, like if you take the" + }, + { + "start": 458.84, + "text": "transformer paper, I think a lot of" + }, + { + "start": 460.88, + "text": "people will say like the transformer" + }, + { + "start": 462.24, + "text": "people got like most of the things" + }, + { + "start": 464.07, + "text": "got like most of the things" + }, + { + "start": 464.08, + "text": "right, except this. And the thing that" + }, + { + "start": 466.48, + "text": "they really did not get right or like I" + }, + { + "start": 468.36, + "text": "think most people agree they did not get" + }, + { + "start": 470.32, + "text": "right is where you put the layer norm," + }, + { + "start": 472.88, + "text": "right? So in the original um" + }, + { + "start": 475.91, + "text": "? So in the original um" + }, + { + "start": 475.92, + "text": "uh transformer paper, you know, the" + }, + { + "start": 477.72, + "text": "layer norm goes in what you would call" + }, + { + "start": 479.56, + "text": "the residual path, right? So in the" + }, + { + "start": 481.32, + "text": "transformer, you know, you have the" + }, + { + "start": 482.76, + "text": "residual stream, this X that kind of" + }, + { + "start": 484.52, + "text": "runs through the whole network. And" + }, + { + "start": 491.76, + "text": "sort of a delta back into this residual" + }, + { + "start": 494.48, + "text": "stream." 
+ }, + { + "start": 495.6, + "text": "And then in order to make sure that" + }, + { + "start": 497.44, + "text": "these gradients are sort of stable" + }, + { + "start": 499.24, + "text": "across layers, you know, a layer norm is" + }, + { + "start": 501.44, + "text": "placed at the end of each of these" + }, + { + "start": 503.28, + "text": "components." + }, + { + "start": 504.92, + "text": "Now, um" + }, + { + "start": 506.64, + "text": "instead of putting the layer norms in" + }, + { + "start": 508.6, + "text": "the residual stream, there's an" + }, + { + "start": 509.88, + "text": "alternative. Um I'll refer to this as" + }, + { + "start": 512.12, + "text": "pre-norm," + }, + { + "start": 513.64, + "text": "um in which you can put the layer norm" + }, + { + "start": 516.159, + "text": "outside of the residual stream, but" + }, + { + "start": 518.24, + "text": "before each of the computations. So you" + }, + { + "start": 520.12, + "text": "can put it before the multi-head" + }, + { + "start": 521.32, + "text": "attention, you can put it before the" + }, + { + "start": 522.68, + "text": "FFN, right? Um we'll call this pre-norm." + }, + { + "start": 526.28, + "text": "the nomenclature will get a little bit" + }, + { + "start": 527.68, + "text": "confusing. Um You can call this uh" + }, + { + "start": 530.4, + "text": "post-norm for now, but let's call this" + }, + { + "start": 532.16, + "text": "sort of residual norm, right? Cuz you're" + }, + { + "start": 533.84, + "text": "putting the norm in the residual layer." + }, + { + "start": 536.24, + "text": "Um basically all modern language models" + }, + { + "start": 539.84, + "text": "uh push the layer norm outside of the" + }, + { + "start": 542.12, + "text": "residual stream. This is just like a" + }, + { + "start": 543.6, + "text": "thing that basically everybody does." + }, + { + "start": 546.56, + "text": "Um there is one funny exception, but it" + }, + { + "start": 549.52, + "text": "is OPT-350M." 
+ }, + { + "start": 551.4, + "text": "And if you all are familiar with sort of" + }, + { + "start": 553.44, + "text": "language models, we kind of know OPT in" + }, + { + "start": 555.88, + "text": "general was, you know, kind of a mess of" + }, + { + "start": 558.32, + "text": "a language model, right? And OPT-350M" + }, + { + "start": 561.32, + "text": "um is even more so because I don't know" + }, + { + "start": 562.96, + "text": "why only that model uh has a post layer" + }, + { + "start": 566.72, + "text": "norm in the residual stream." + }, + { + "start": 568.92, + "text": "Okay. So this is one of the things that" + }, + { + "start": 570.84, + "text": "like everyone agrees on. And so you" + }, + { + "start": 572.8, + "text": "might wonder like why is this like such" + }, + { + "start": 575.04, + "text": "a, you know," + }, + { + "start": 576.8, + "text": "uh like a uni- unified thing across all" + }, + { + "start": 579.8, + "text": "the different models?" + }, + { + "start": 582.28, + "text": "Um and if you look at some of the early" + }, + { + "start": 584.36, + "text": "works studying like where do you place" + }, + { + "start": 586.6, + "text": "the layer norm style research, um what" + }, + { + "start": 589.44, + "text": "you really see is that, you know, the" + }, + { + "start": 592.28, + "text": "early motivation for a lot of this was" + }, + { + "start": 595.28, + "text": "when you train a transformer, you need" + }, + { + "start": 596.84, + "text": "to do a warm-up. Actually, you know," + }, + { + "start": 598.48, + "text": "modern transformer training still does" + }, + { + "start": 600.04, + "text": "warm-ups as well. But you definitely" + }, + { + "start": 602.28, + "text": "need to do warm-up when you train." + }, + { + "start": 604.56, + "text": "Now, wouldn't it be nice if we could" + }, + { + "start": 606.44, + "text": "remove the warm-up, right? 
So, that was" + }, + { + "start": 608.28, + "text": "kind of the initial motivation for a lot" + }, + { + "start": 609.8, + "text": "of this research." + }, + { + "start": 611.32, + "text": "But people quickly realized that" + }, + { + "start": 612.8, + "text": "removing the warm-up had very serious" + }, + { + "start": 615.16, + "text": "issues in terms of the stability and" + }, + { + "start": 616.92, + "text": "convergence" + }, + { + "start": 618.32, + "text": "of these things, right? So, if you did" + }, + { + "start": 620.24, + "text": "post norm plus layer norm, which is, you" + }, + { + "start": 622.32, + "text": "know, basically the original transformer" + }, + { + "start": 623.92, + "text": "thing, you got this purple dash line." + }, + { + "start": 625.84, + "text": "Oh, you just don't converge as well" + }, + { + "start": 628.56, + "text": "compared to doing something like pre" + }, + { + "start": 630.56, + "text": "norm. You can ignore the other terms." + }, + { + "start": 632.92, + "text": "You would get much nicer convergence" + }, + { + "start": 634.68, + "text": "even without warm-up, right? So, this" + }, + { + "start": 636.16, + "text": "was the original motivation." + }, + { + "start": 638.64, + "text": "But really what people kind of quickly" + }, + { + "start": 640.4, + "text": "realized is that, you know, moving the" + }, + { + "start": 643.64, + "text": "layer norms outside the residual stream" + }, + { + "start": 645.92, + "text": "has some pretty important implications" + }, + { + "start": 648.24, + "text": "as you make your network deeper and as" + }, + { + "start": 650.8, + "text": "you start to grapple with stability" + }, + { + "start": 652.76, + "text": "issues." + }, + { + "start": 654.4, + "text": "To me, I think the gradient attenuation" + }, + { + "start": 656.32, + "text": "issues are kind of the most clear." 
+ }, + { + "start": 660.28, + "text": "When you sort of talk to people who do" + }, + { + "start": 662.12, + "text": "architecture design, I'm not really one" + }, + { + "start": 664.64, + "text": "of the the people that deeply engages in" + }, + { + "start": 666.44, + "text": "this. But one of the things that people" + }, + { + "start": 667.72, + "text": "often say is keep your residual stream" + }, + { + "start": 670.04, + "text": "clean, right? So, in this case, you have" + }, + { + "start": 672.04, + "text": "your X's coming in from the bottom on" + }, + { + "start": 673.76, + "text": "the pre norm side, and this X propagates" + }, + { + "start": 676.52, + "text": "all the way up to the top, right? All" + }, + { + "start": 677.92, + "text": "the way up to your your final output." + }, + { + "start": 680.04, + "text": "And that allows gradients to propagate" + }, + { + "start": 681.8, + "text": "if you in the backward pass straight" + }, + { + "start": 683.76, + "text": "through this, right?" + }, + { + "start": 685.6, + "text": "Um that makes gradient propagation very" + }, + { + "start": 687.28, + "text": "simple," + }, + { + "start": 688.6, + "text": "which improves both stability and signal" + }, + { + "start": 691.56, + "text": "propagation. And that's sort of what" + }, + { + "start": 693.32, + "text": "people realized very, very quickly, that" + }, + { + "start": 695.2, + "text": "if you do something like pre norm in" + }, + { + "start": 698.04, + "text": "blue" + }, + { + "start": 699.16, + "text": "initialization, sort of the gradient" + }, + { + "start": 701.44, + "text": "size is kind of remains the same, right?" + }, + { + "start": 703.6, + "text": "Because you have this nice straight" + }, + { + "start": 705.0, + "text": "through propagation in the backward" + }, + { + "start": 706.44, + "text": "pass. 
On the other hand, if you have" + }, + { + "start": 708.36, + "text": "post layer norm, you have these kind of" + }, + { + "start": 710.64, + "text": "complicated effects that happen because" + }, + { + "start": 713.08, + "text": "you're layer norming each time you're" + }, + { + "start": 715.16, + "text": "going through a transformer block. And" + }, + { + "start": 716.68, + "text": "that's going to change the norm of your" + }, + { + "start": 717.839, + "text": "gradients as you go backwards through." + }, + { + "start": 720.72, + "text": "So, you can kind of see," + }, + { + "start": 723.64, + "text": "you know, from the principle of keep" + }, + { + "start": 725.32, + "text": "your residual stream clean, that pre" + }, + { + "start": 727.36, + "text": "norm makes a lot of sense." + }, + { + "start": 729.4, + "text": "People also realized through" + }, + { + "start": 730.92, + "text": "experimentation that this also improves" + }, + { + "start": 732.96, + "text": "stability in general, that the sizes and" + }, + { + "start": 735.56, + "text": "frequencies of gradient spikes" + }, + { + "start": 738.64, + "text": "were improved under pre norm compared to" + }, + { + "start": 741.24, + "text": "post norm. And, you know, this is a" + }, + { + "start": 742.72, + "text": "figure from Salazar and UN, who were one" + }, + { + "start": 744.96, + "text": "of the first ones, I think, to to study" + }, + { + "start": 746.8, + "text": "this phenomena carefully." + }, + { + "start": 748.76, + "text": "I think this is the reason why it stuck" + }, + { + "start": 750.32, + "text": "around, right? Stability and the ability" + }, + { + "start": 753.12, + "text": "to go deep are both very, very important" + }, + { + "start": 755.48, + "text": "for modern large language models. 
And" + }, + { + "start": 757.92, + "text": "so, this idea of moving your layer norm" + }, + { + "start": 760.0, + "text": "outside of the residual stream is one" + }, + { + "start": 762.08, + "text": "that basically everyone has adopted." + }, + { + "start": 765.48, + "text": "Um so now, you know, if putting layer" + }, + { + "start": 768.64, + "text": "norms in residual streams is bad," + }, + { + "start": 772.36, + "text": "why does layer norm have to be at the" + }, + { + "start": 774.2, + "text": "start? Of course, we have pre norm," + }, + { + "start": 775.8, + "text": "which is, you know, before our" + }, + { + "start": 777.0, + "text": "computation, but we could have it after" + }, + { + "start": 778.68, + "text": "computation as well, right? That's" + }, + { + "start": 779.95, + "text": "as well, right? That's" + }, + { + "start": 779.96, + "text": "equally good at least under that" + }, + { + "start": 781.32, + "text": "knowledge logic." + }, + { + "start": 782.96, + "text": "Um and that's exactly right. Many recent" + }, + { + "start": 786.44, + "text": "models like Grok or Gemma 2 or Olmo 2" + }, + { + "start": 790.16, + "text": "have the structure where they moved the" + }, + { + "start": 791.839, + "text": "layer norm after the computation. So," + }, + { + "start": 793.92, + "text": "it's a post norm of a kind, but it's" + }, + { + "start": 795.48, + "text": "outside the residual stream." + }, + { + "start": 797.4, + "text": "Other models still actually just put" + }, + { + "start": 799.0, + "text": "layer norms everywhere. They put a layer" + }, + { + "start": 800.24, + "text": "norm here, they put a layer norm after." 
+ }, + { + "start": 803.2, + "text": "I'll get to this later as we talk about" + }, + { + "start": 805.0, + "text": "stability, but one of the other lessons" + }, + { + "start": 807.4, + "text": "that it seems to have held up very well" + }, + { + "start": 809.48, + "text": "is if you have stability issues, you can" + }, + { + "start": 811.16, + "text": "kind of sprinkle in layer norms" + }, + { + "start": 812.68, + "text": "everywhere, and that will generally" + }, + { + "start": 814.48, + "text": "improve stability." + }, + { + "start": 816.24, + "text": "It's almost very strange to be saying" + }, + { + "start": 817.8, + "text": "this cuz it's so ridiculous. And yet," + }, + { + "start": 819.6, + "text": "that statement has actually been proven" + }, + { + "start": 821.16, + "text": "right. Every time, you know, people have" + }, + { + "start": 823.0, + "text": "encountered stability issues, they say," + }, + { + "start": 824.8, + "text": "\"Oh, but what if we just throw a layer" + }, + { + "start": 826.32, + "text": "norm into attention?\" Turns out that" + }, + { + "start": 828.2, + "text": "works, too. We'll get to that later. So," + }, + { + "start": 829.68, + "text": "okay, that's post norm" + }, + { + "start": 831.28, + "text": "or double norm in this case where you" + }, + { + "start": 832.6, + "text": "have two layer norms here." + }, + { + "start": 835.08, + "text": "Okay. The other thing that you can do is" + }, + { + "start": 838.04, + "text": "in the original transformer, you have" + }, + { + "start": 840.12, + "text": "the layer norm, which is this operation" + }, + { + "start": 841.92, + "text": "right here. So, you have your" + }, + { + "start": 843.08, + "text": "activations X." + }, + { + "start": 844.76, + "text": "You're going to mean subtract, divide" + }, + { + "start": 846.12, + "text": "the variance, and then scale it back up," + }, + { + "start": 847.96, + "text": "right? And this works just fine, right?" 
+ }, + { + "start": 849.96, + "text": "It's not like this is wrong. And many" + }, + { + "start": 852.04, + "text": "models have successfully trained on this" + }, + { + "start": 853.839, + "text": "scheme." + }, + { + "start": 855.0, + "text": "But basically, most or all modern" + }, + { + "start": 857.8, + "text": "models, I think," + }, + { + "start": 859.64, + "text": "use RMS norm, which doesn't subtract the" + }, + { + "start": 861.839, + "text": "mean or add a bias term, right? So, it's" + }, + { + "start": 864.24, + "text": "just a scaling down and scaling back up," + }, + { + "start": 866.72, + "text": "right? So, you can see this in the" + }, + { + "start": 867.52, + "text": "equation here." + }, + { + "start": 868.76, + "text": "Um and really, layer norm is more" + }, + { + "start": 871.6, + "text": "expressive than RMS norm. So, there's" + }, + { + "start": 873.36, + "text": "really representationally no reason why" + }, + { + "start": 876.16, + "text": "you have to use RMS norm." + }, + { + "start": 878.28, + "text": "But RMS norm is nice because in" + }, + { + "start": 880.72, + "text": "practice," + }, + { + "start": 882.4, + "text": "there's really no expressiveness loss." + }, + { + "start": 884.44, + "text": "RMS norm models just as well as layer" + }, + { + "start": 886.64, + "text": "norm. But more importantly, it is, you" + }, + { + "start": 889.52, + "text": "know, faster, right? This is the part" + }, + { + "start": 891.24, + "text": "where kind of the systems and sort of" + }, + { + "start": 893.4, + "text": "architecture co-design starts to come" + }, + { + "start": 895.2, + "text": "in." + }, + { + "start": 896.12, + "text": "Percy mentioned, you know, in the" + }, + { + "start": 897.72, + "text": "previous lecture, this idea of" + }, + { + "start": 899.04, + "text": "arithmetic intensity, right? 
We want to" + }, + { + "start": 900.72, + "text": "keep our GPUs hot by doing, you know," + }, + { + "start": 903.079, + "text": "matrix multiplies and other very intense" + }, + { + "start": 905.079, + "text": "computations. We do not want to be" + }, + { + "start": 907.16, + "text": "wasting our GPUs by having them move" + }, + { + "start": 909.36, + "text": "little tiny bits of memory back and" + }, + { + "start": 911.16, + "text": "forth, right? That's a very inefficient" + }, + { + "start": 912.76, + "text": "use of our, you know, very powerful GPU." + }, + { + "start": 916.04, + "text": "And so, what we want is to remove" + }, + { + "start": 918.4, + "text": "operations that are small and involve" + }, + { + "start": 920.52, + "text": "memory movement, but don't give us much" + }, + { + "start": 922.959, + "text": "expressive power, right? So, by that" + }, + { + "start": 924.92, + "text": "view, what we really want to be doing" + }, + { + "start": 926.92, + "text": "here is, you know, if the mean" + }, + { + "start": 928.48, + "text": "subtraction and addition isn't really" + }, + { + "start": 930.72, + "text": "doing much for us, just get rid of it," + }, + { + "start": 932.56, + "text": "right? Um" + }, + { + "start": 934.4, + "text": "you might think, \"Okay, why does this" + }, + { + "start": 936.68, + "text": "matter? We're just optimizing this" + }, + { + "start": 938.2, + "text": "teeny, tiny operation that accounts for," + }, + { + "start": 941.44, + "text": "you know, in this case, something like" + }, + { + "start": 943.2, + "text": "0.17%" + }, + { + "start": 945.079, + "text": "of the total floating point operations" + }, + { + "start": 947.52, + "text": "of our system.\"" + }, + { + "start": 949.12, + "text": "But, you know, as Percy mentioned, it's" + }, + { + "start": 951.88, + "text": "not really about the flops, right? 
The" + }, + { + "start": 953.56, + "text": "flops are the the floating point" + }, + { + "start": 955.16, + "text": "operations we do, that's sort of" + }, + { + "start": 956.52, + "text": "multiplying matrices, but that's not" + }, + { + "start": 958.56, + "text": "runtime, right? Runtime is a much more" + }, + { + "start": 961.04, + "text": "complicated object. And, you know," + }, + { + "start": 963.64, + "text": "statistical normalizations, things like" + }, + { + "start": 965.6, + "text": "layer norms, even though they're only" + }, + { + "start": 967.24, + "text": "0.17% of the flops, depending on your" + }, + { + "start": 970.04, + "text": "workload and depending on the setup, can" + }, + { + "start": 971.88, + "text": "be up to 25% of the runtime, right?" + }, + { + "start": 973.959, + "text": "That's kind of crazy. On tiny models," + }, + { + "start": 975.76, + "text": "this can be really, really big because" + }, + { + "start": 977.76, + "text": "you're still having to move all these" + }, + { + "start": 979.36, + "text": "parameters back and forth from fast to" + }, + { + "start": 981.24, + "text": "slow memory and vice versa" + }, + { + "start": 983.36, + "text": "when you're doing these operations. So," + }, + { + "start": 985.2, + "text": "data movement is really, really" + }, + { + "start": 986.52, + "text": "important, and RMS norm can still matter" + }, + { + "start": 989.28, + "text": "a lot because of this, right? So," + }, + { + "start": 992.079, + "text": "um" + }, + { + "start": 992.64, + "text": "you can see kind of the difference here." + }, + { + "start": 994.959, + "text": "The arithmetic intensity is in white," + }, + { + "start": 997.32, + "text": "and then you can kind of see the flops" + }, + { + "start": 999.2, + "text": "involved in black. 
And you see that" + }, + { + "start": 1000.8, + "text": "layer norm has a very low arithmetic" + }, + { + "start": 1002.68, + "text": "intensity, which is the operation we try" + }, + { + "start": 1004.6, + "text": "to want to remove as much as possible." + }, + { + "start": 1006.4, + "text": "Yeah, question over there." + }, + { + "start": 1008.48, + "text": "Data movement for normalization is so" + }, + { + "start": 1010.92, + "text": "disproportionate compared to arithmetic" + }, + { + "start": 1012.72, + "text": "contraction" + }, + { + "start": 1014.52, + "text": "So, for a ten- Something like tensor" + }, + { + "start": 1016.44, + "text": "contraction, which is, in this case," + }, + { + "start": 1018.24, + "text": "matrix multiplies," + }, + { + "start": 1019.92, + "text": "the majority of the workload is, you" + }, + { + "start": 1022.0, + "text": "know, multiplying. Whereas for stat" + }, + { + "start": 1024.52, + "text": "normalization, the majority of the" + }, + { + "start": 1025.679, + "text": "workload is memory movement. And memory" + }, + { + "start": 1027.439, + "text": "movement is quite slow. So, imagine the" + }, + { + "start": 1029.079, + "text": "case where like moving something is like" + }, + { + "start": 1031.52, + "text": "almost all of the compute, then you're" + }, + { + "start": 1033.28, + "text": "still paying quite a bit here, right?" + }, + { + "start": 1034.6, + "text": "Cuz activations can be quite large." + }, + { + "start": 1038.36, + "text": "Yeah, I think the percent runtime in" + }, + { + "start": 1039.72, + "text": "this case is quite extreme. 
This is like" + }, + { + "start": 1041.48, + "text": "in the, you know, tiny models with like" + }, + { + "start": 1043.52, + "text": "matrices that don't really generally" + }, + { + "start": 1045.04, + "text": "make sense in modern workloads, but this" + }, + { + "start": 1046.679, + "text": "is, you know, giving you a sense of why" + }, + { + "start": 1048.6, + "text": "this is a free optimization." + }, + { + "start": 1051.0, + "text": "Um and you do see this, right? This is" + }, + { + "start": 1052.92, + "text": "another paper in which people were" + }, + { + "start": 1055.08, + "text": "evaluating different architecture" + }, + { + "start": 1056.4, + "text": "interventions. They're on Get All in" + }, + { + "start": 1057.6, + "text": "2020." + }, + { + "start": 1058.92, + "text": "I think this was a Google paper, and" + }, + { + "start": 1060.28, + "text": "they show, you know, for teeny, tiny" + }, + { + "start": 1061.56, + "text": "transformer of a 200 million parameters," + }, + { + "start": 1063.76, + "text": "you got more steps per second. That's" + }, + { + "start": 1065.32, + "text": "the third column over here when you" + }, + { + "start": 1066.52, + "text": "switch to RMS norm. And in fact, you" + }, + { + "start": 1068.6, + "text": "actually get better performance, which I" + }, + { + "start": 1070.0, + "text": "don't think is something that you're" + }, + { + "start": 1071.36, + "text": "guaranteed, but it's a nice bonus" + }, + { + "start": 1073.48, + "text": "regardless, right? So, you got a free" + }, + { + "start": 1075.36, + "text": "systems win by just moving to RMS norm." + }, + { + "start": 1077.92, + "text": "And so, basically, everyone has has" + }, + { + "start": 1079.24, + "text": "decided to move over to this now." + }, + { + "start": 1082.6, + "text": "And in general," + }, + { + "start": 1084.48, + "text": "there's a more general version of this." 
+ }, + { + "start": 1086.56, + "text": "There's no guarantee to any of the" + }, + { + "start": 1087.8, + "text": "things I'm saying, but bias terms in" + }, + { + "start": 1090.8, + "text": "transformers and neural networks" + }, + { + "start": 1092.88, + "text": "are generally not that useful. So, in" + }, + { + "start": 1094.96, + "text": "the original transformer, the linear" + }, + { + "start": 1096.44, + "text": "terms all have biases," + }, + { + "start": 1099.0, + "text": "but most implementations actually just" + }, + { + "start": 1101.28, + "text": "drop the biases entirely, right? Once" + }, + { + "start": 1103.2, + "text": "again, this is another example of" + }, + { + "start": 1105.08, + "text": "something that's not very arithmetically" + }, + { + "start": 1106.72, + "text": "intense, but fairly memory" + }, + { + "start": 1108.88, + "text": "intensive, relatively speaking. And so," + }, + { + "start": 1111.23, + "text": "ive, relatively speaking. And so," + }, + { + "start": 1111.24, + "text": "you might as well just drop these," + }, + { + "start": 1112.88, + "text": "right? And get the free systems win." + }, + { + "start": 1115.48, + "text": "There's also some cases, I'll just" + }, + { + "start": 1116.919, + "text": "mention this offhand, where the bias" + }, + { + "start": 1118.6, + "text": "terms can also induce stability issues." + }, + { + "start": 1120.88, + "text": "So, they're useful in other ways, but" + }, + { + "start": 1122.84, + "text": "really, I think the primary reason these" + }, + { + "start": 1124.4, + "text": "are dropped is just to simplify things" + }, + { + "start": 1126.6, + "text": "from the systems perspective." + }, + { + "start": 1129.32, + "text": "Cool." + }, + { + "start": 1131.64, + "text": "Okay. So, I think layer norms, the story" + }, + { + "start": 1134.12, + "text": "is pretty easy." 
+ }, + { + "start": 1135.6, + "text": "Um it's easy in the sense that what" + }, + { + "start": 1137.2, + "text": "people do is fairly standardized. Our" + }, + { + "start": 1139.679, + "text": "our understanding, not at like a deep" + }, + { + "start": 1141.8, + "text": "theoretical level, but our understanding" + }, + { + "start": 1143.36, + "text": "of like what layer norm does is fairly" + }, + { + "start": 1146.52, + "text": "good, right? Everyone moves the layer" + }, + { + "start": 1148.48, + "text": "norm outside the residual stream," + }, + { + "start": 1150.96, + "text": "often pre norm, but I think this might" + }, + { + "start": 1152.6, + "text": "partially be be because Llama 2 did" + }, + { + "start": 1154.56, + "text": "that. Um and we roughly have a sense of" + }, + { + "start": 1157.76, + "text": "how to use layer norm to control things" + }, + { + "start": 1159.72, + "text": "like" + }, + { + "start": 1160.84, + "text": "gradient spikes" + }, + { + "start": 1162.8, + "text": "and keep signal propagation nice." + }, + { + "start": 1165.44, + "text": "Related to that, we also now, you know," + }, + { + "start": 1167.72, + "text": "basically always use RMS norm, and you" + }, + { + "start": 1169.48, + "text": "hopefully understand the general" + }, + { + "start": 1170.72, + "text": "principles here" + }, + { + "start": 1172.32, + "text": "of basically just dropping bias terms." + }, + { + "start": 1174.84, + "text": "And that allows us to to keep our system" + }, + { + "start": 1177.24, + "text": "more arithmetically intense" + }, + { + "start": 1179.6, + "text": "while keeping the expressive power the" + }, + { + "start": 1181.0, + "text": "same." + }, + { + "start": 1182.2, + "text": "Um I think the unsatisfying thing about" + }, + { + "start": 1184.6, + "text": "a lot of architectures is that, you" + }, + { + "start": 1186.679, + "text": "know, you can't really reason about this" + }, + { + "start": 1188.32, + "text": "beforehand, right? 
Like we don't know" + }, + { + "start": 1189.72, + "text": "beforehand that dropping the bias terms" + }, + { + "start": 1191.43, + "text": "that dropping the bias terms" + }, + { + "start": 1191.44, + "text": "is okay, but from a lot of" + }, + { + "start": 1193.08, + "text": "experimentation and now collectively" + }, + { + "start": 1194.76, + "text": "acquired knowledge, we roughly know that" + }, + { + "start": 1197.04, + "text": "dropping the bias terms on both the" + }, + { + "start": 1198.68, + "text": "linear and RMS norm is okay for typical" + }, + { + "start": 1201.24, + "text": "language modeling workloads, right? Um" + }, + { + "start": 1203.12, + "text": "this is the kind of statement that we" + }, + { + "start": 1204.36, + "text": "can make on the basis of of what we do" + }, + { + "start": 1206.44, + "text": "when we look at a variety of different" + }, + { + "start": 1207.96, + "text": "models." + }, + { + "start": 1209.64, + "text": "Okay, any questions uh for layer norm" + }, + { + "start": 1211.68, + "text": "stuff?" + }, + { + "start": 1213.92, + "text": "Good. Okay. So, now I'm going to talk" + }, + { + "start": 1216.12, + "text": "about activations. Um" + }, + { + "start": 1218.24, + "text": "and there's a whole zoo of activations." + }, + { + "start": 1220.24, + "text": "There's just a lot, right? Like ReLU," + }, + { + "start": 1221.84, + "text": "GELU, Swish ELU, GeGLU," + }, + { + "start": 1225.0, + "text": "SeLU, SwiGLU, LiGLU." + }, + { + "start": 1227.24, + "text": "Um and what are these things?" + }, + { + "start": 1229.0, + "text": "Um I think at what point one point of" + }, + { + "start": 1231.12, + "text": "my, you know, more stats ML training, I" + }, + { + "start": 1233.92, + "text": "thought to myself I will never learn" + }, + { + "start": 1235.4, + "text": "these things. I will make it a point of" + }, + { + "start": 1236.8, + "text": "pride to never know what a SwiGLU is. 
Um" + }, + { + "start": 1238.92, + "text": "but now it's actually very important for" + }, + { + "start": 1240.72, + "text": "us to to actually like have a general" + }, + { + "start": 1242.76, + "text": "sense of what these objects are um and" + }, + { + "start": 1245.04, + "text": "which parts of these names actually kind" + }, + { + "start": 1247.0, + "text": "of matter for performance, right?" + }, + { + "start": 1250.08, + "text": "Um So, you can build and train a" + }, + { + "start": 1253.56, + "text": "language model on just a fairly vanilla" + }, + { + "start": 1256.44, + "text": "activation. Um" + }, + { + "start": 1258.4, + "text": "even, you know, I guess Chinchilla is" + }, + { + "start": 1260.36, + "text": "probably the best model out of that" + }, + { + "start": 1261.96, + "text": "group, but even if you just want a ReLU," + }, + { + "start": 1264.24, + "text": "you know, you can train a reasonably" + }, + { + "start": 1265.64, + "text": "performant language model using just" + }, + { + "start": 1267.4, + "text": "that activation. There's nothing wrong" + }, + { + "start": 1268.96, + "text": "with that, right? And if we move to" + }, + { + "start": 1270.68, + "text": "GELU, which is a Gaussian error unit," + }, + { + "start": 1272.92, + "text": "and really the only difference is this" + }, + { + "start": 1274.4, + "text": "tiny divot at the bottom here, which" + }, + { + "start": 1277.08, + "text": "really, you know, for the most of the" + }, + { + "start": 1279.04, + "text": "activation doesn't change anything, but" + }, + { + "start": 1280.48, + "text": "changes the gradients right near zero," + }, + { + "start": 1282.68, + "text": "um then you can train models like GPT-3," + }, + { + "start": 1285.2, + "text": "right? That's a perfectly good large" + }, + { + "start": 1286.76, + "text": "language model, not, you know, modern by" + }, + { + "start": 1288.84, + "text": "modern standards, but perfectly fine." 
+ }, + { + "start": 1291.68, + "text": "Um but then, you know, we get to the" + }, + { + "start": 1293.56, + "text": "gated linear units like SwiGLU and" + }, + { + "start": 1295.44, + "text": "GeGLU, and these are really where most" + }, + { + "start": 1297.68, + "text": "of the action is. You know, this is very" + }, + { + "start": 1299.48, + "text": "similar to layer norm in that I think" + }, + { + "start": 1302.24, + "text": "almost all credible modern language" + }, + { + "start": 1305.44, + "text": "models use a gated linear unit of some" + }, + { + "start": 1308.08, + "text": "kind." + }, + { + "start": 1309.88, + "text": "Okay, so what is a gated linear unit?" + }, + { + "start": 1312.44, + "text": "So, these are gated activations. So, if" + }, + { + "start": 1314.68, + "text": "we want to look at something like a feed" + }, + { + "start": 1316.36, + "text": "forward layer, um we can just look at" + }, + { + "start": 1318.32, + "text": "this first part. This is, you know, a" + }, + { + "start": 1319.6, + "text": "very standard ReLU feed forward, right?" + }, + { + "start": 1321.96, + "text": "I have my X, I hit it with a W1, you" + }, + { + "start": 1324.8, + "text": "know, I I entry-wise threshold at zero," + }, + { + "start": 1326.96, + "text": "and then I hit it with another W2, I get" + }, + { + "start": 1328.679, + "text": "my output, right? Very straightforward" + }, + { + "start": 1330.48, + "text": "ReLU network." + }, + { + "start": 1333.48, + "text": "Another thing, um I don't say this as my" + }, + { + "start": 1335.72, + "text": "personal experience, but another thing" + }, + { + "start": 1337.44, + "text": "that is often said in architecture" + }, + { + "start": 1338.92, + "text": "design is that gating is often very" + }, + { + "start": 1341.28, + "text": "helpful. 
So, if you apply that very" + }, + { + "start": 1342.96, + "text": "general heuristic, what you might get is" + }, + { + "start": 1345.08, + "text": "to say, \"Okay, well, instead of just" + }, + { + "start": 1347.6, + "text": "having, you know, this entry-wise ReLU," + }, + { + "start": 1350.4, + "text": "why don't we also have a gate? And the" + }, + { + "start": 1352.24, + "text": "second gate, the second term here, this" + }, + { + "start": 1354.88, + "text": "is just going to multiply the output of" + }, + { + "start": 1356.96, + "text": "my ReLU entry-wise, and I have a second" + }, + { + "start": 1360.08, + "text": "matrix V, okay?\"" + }, + { + "start": 1362.0, + "text": "Now, this is just going to modulate the" + }, + { + "start": 1363.52, + "text": "output of my ReLU." + }, + { + "start": 1365.4, + "text": "Um and then I'm going to do everything" + }, + { + "start": 1367.6, + "text": "else the same, right? So, instead of," + }, + { + "start": 1369.92, + "text": "you know, just having XW1W2, I have XW1," + }, + { + "start": 1373.64, + "text": "and I'm going to gate that with XV. This" + }, + { + "start": 1375.72, + "text": "is another" + }, + { + "start": 1377.04, + "text": "uh activation the same size as this, and" + }, + { + "start": 1379.16, + "text": "then I'm going to, you know, down" + }, + { + "start": 1380.64, + "text": "project it back with W2." + }, + { + "start": 1383.16, + "text": "Okay, so what is this? Now, this is a" + }, + { + "start": 1384.84, + "text": "RegLU. This is a you you make these" + }, + { + "start": 1387.04, + "text": "names by adding the first activation, in" + }, + { + "start": 1388.96, + "text": "this case ReLU, and GLU, right? So, the" + }, + { + "start": 1391.12, + "text": "ReLU gated linear unit." 
+ }, + { + "start": 1393.48, + "text": "Um and gating has been a, you know, very" + }, + { + "start": 1395.96, + "text": "effective other primitive in" + }, + { + "start": 1398.16, + "text": "architecture design, and it turns out" + }, + { + "start": 1400.12, + "text": "that this is very effective in language" + }, + { + "start": 1401.88, + "text": "modeling as well." + }, + { + "start": 1403.679, + "text": "So," + }, + { + "start": 1404.72, + "text": "um if you take something like a GELU," + }, + { + "start": 1406.96, + "text": "we've already talked about that, right?" + }, + { + "start": 1408.16, + "text": "That's like the ReLU, but with a little" + }, + { + "start": 1409.52, + "text": "divot at the bottom here, um you will" + }, + { + "start": 1411.679, + "text": "get a GeGLU. Um and if you take a" + }, + { + "start": 1414.52, + "text": "SwiGLU, which is X times a sigmoid, then" + }, + { + "start": 1417.6, + "text": "you will get a SwiGLU. So, this is a" + }, + { + "start": 1419.76, + "text": "Swish times, you know, the rest of it." + }, + { + "start": 1422.44, + "text": "Um and this really covers a lot of the" + }, + { + "start": 1425.24, + "text": "modern models, right? Um generally the" + }, + { + "start": 1427.679, + "text": "Google folks have used GeGLU, so like" + }, + { + "start": 1430.04, + "text": "the Gemma models, the T5 models are" + }, + { + "start": 1432.2, + "text": "those. Um and everything that's kind of" + }, + { + "start": 1434.36, + "text": "like a LLaMA descendant uses a SwiGLU." + }, + { + "start": 1437.16, + "text": "Um so, PaLM and the LLaMA descendants" + }, + { + "start": 1439.92, + "text": "are all kind of SwiGLU models. Um" + }, + { + "start": 1443.16, + "text": "I would say that SwiGLU is probably the" + }, + { + "start": 1444.96, + "text": "more dominant one, but honestly amongst" + }, + { + "start": 1447.2, + "text": "the gated units, doesn't really matter." 
+ }, + { + "start": 1450.4, + "text": "Now, here's a side note that will uh be" + }, + { + "start": 1452.76, + "text": "a semi-important piece of trivia later." + }, + { + "start": 1456.04, + "text": "Um if you look up here, right? Um you" + }, + { + "start": 1459.0, + "text": "will notice that there are more" + }, + { + "start": 1460.32, + "text": "parameters for the gated uh model," + }, + { + "start": 1462.679, + "text": "right? Cuz I have this parameter of V." + }, + { + "start": 1465.12, + "text": "And so, if you do a little bit of math," + }, + { + "start": 1466.64, + "text": "right? I now have three matrices instead" + }, + { + "start": 1469.16, + "text": "of two matrices," + }, + { + "start": 1471.44, + "text": "right? What you should do is you should" + }, + { + "start": 1473.32, + "text": "maybe use a smaller feed forward" + }, + { + "start": 1475.32, + "text": "dimension by a factor of 2/3 in order to" + }, + { + "start": 1478.36, + "text": "keep the total parameter count the same," + }, + { + "start": 1480.4, + "text": "right? So, this is roughly the idea of," + }, + { + "start": 1482.56, + "text": "\"Well, I want to keep the same number of" + }, + { + "start": 1484.24, + "text": "total parameters as my original MLP, but" + }, + { + "start": 1486.8, + "text": "I now want to make it gated, so I'm" + }, + { + "start": 1488.52, + "text": "going to make the feed forward" + }, + { + "start": 1489.76, + "text": "dimension, which is the output dimension" + }, + { + "start": 1491.44, + "text": "of this W, a little bit smaller by 2/3," + }, + { + "start": 1494.6, + "text": "right?\" So, this is a general rule of" + }, + { + "start": 1496.04, + "text": "thumb that people have followed, but" + }, + { + "start": 1497.56, + "text": "it's not really an iron rule." 
+ }, + { + "start": 1501.8, + "text": "You know, the original Noam Shazeer" + }, + { + "start": 1503.24, + "text": "paper that, you know, proposed this," + }, + { + "start": 1506.48, + "text": "had some, you know, very small deltas" + }, + { + "start": 1509.76, + "text": "originally, but they're consistent" + }, + { + "start": 1512.12, + "text": "deltas, and I think to his credit, um" + }, + { + "start": 1515.36, + "text": "I think a lot of his papers have these" + }, + { + "start": 1517.0, + "text": "like error bar assessments of like" + }, + { + "start": 1518.679, + "text": "training multiple replicates and" + }, + { + "start": 1520.28, + "text": "checking to see if they're better. Um" + }, + { + "start": 1522.6, + "text": "and if you look, the GLU variants are" + }, + { + "start": 1525.44, + "text": "almost always consistently better than" + }, + { + "start": 1527.679, + "text": "the non-GLU variants. And this is a" + }, + { + "start": 1529.56, + "text": "parameter matched comparison because um" + }, + { + "start": 1532.16, + "text": "Noam Shazeer is always doing this 2/3" + }, + { + "start": 1534.24, + "text": "adjustment to make sure that all of the" + }, + { + "start": 1536.2, + "text": "models have the total same total number" + }, + { + "start": 1538.84, + "text": "um of parameters." + }, + { + "start": 1540.84, + "text": "So, this is quite nice. It's in some" + }, + { + "start": 1543.0, + "text": "ways a free win. Um almost everyone uses" + }, + { + "start": 1546.08, + "text": "a GLU. There have been other sort of" + }, + { + "start": 1548.88, + "text": "more controlled systematic comparisons." + }, + { + "start": 1550.56, + "text": "This is uh the same paper I was talking" + }, + { + "start": 1552.4, + "text": "about before, Noam et al. in 2020." 
+ }, + { + "start": 1554.76, + "text": "Um Google actually in the 2020s did" + }, + { + "start": 1557.84, + "text": "quite a few nice large-scale" + }, + { + "start": 1560.0, + "text": "architecture comparison papers, um" + }, + { + "start": 1562.04, + "text": "although with a T5 architecture and not" + }, + { + "start": 1564.56, + "text": "uh autoregressive uh language model. Um" + }, + { + "start": 1567.32, + "text": "and they, you know, basically" + }, + { + "start": 1568.4, + "text": "comprehensively compare things like" + }, + { + "start": 1569.84, + "text": "GLUs, and you see once again, um if we" + }, + { + "start": 1571.72, + "text": "look at the SwiGLU or the GeGLU or the" + }, + { + "start": 1573.96, + "text": "GLUs in general, they do significantly" + }, + { + "start": 1576.64, + "text": "better at loss or the other downstream" + }, + { + "start": 1578.56, + "text": "metrics, right?" + }, + { + "start": 1580.2, + "text": "Fairly compelling on paper uh on these" + }, + { + "start": 1581.96, + "text": "papers, also clear from now a lot of" + }, + { + "start": 1584.72, + "text": "model training runs that SwiGLU and GLU" + }, + { + "start": 1588.32, + "text": "are good, right?" + }, + { + "start": 1590.36, + "text": "So, there's a lot of variations in" + }, + { + "start": 1591.88, + "text": "gating, but really the important single" + }, + { + "start": 1593.88, + "text": "axis to know is that gating uh for these" + }, + { + "start": 1597.28, + "text": "nonlinearities is actually quite" + }, + { + "start": 1598.76, + "text": "important, gives you" + }, + { + "start": 1600.52, + "text": "uh nice boost without much of a" + }, + { + "start": 1602.08, + "text": "computational cost. Um you know, that's" + }, + { + "start": 1604.96, + "text": "not to say that gated linear units are" + }, + { + "start": 1606.64, + "text": "necessary. I mean, GPT-3 was that. 
Um I" + }, + { + "start": 1609.679, + "text": "think the NeMo Tron 340B model used a" + }, + { + "start": 1612.0, + "text": "squared ReLU, which is a kind of a crazy" + }, + { + "start": 1614.04, + "text": "choice, but that works, too. Um both of" + }, + { + "start": 1616.679, + "text": "these models are perfectly performant," + }, + { + "start": 1618.64, + "text": "but it's actually quite rare to see" + }, + { + "start": 1620.679, + "text": "anything that's not trained on a gated" + }, + { + "start": 1622.76, + "text": "linear unit, right? So, evidence is" + }, + { + "start": 1624.44, + "text": "pointing towards consistent gains on" + }, + { + "start": 1626.36, + "text": "using these gating tricks." + }, + { + "start": 1630.4, + "text": "So, those are I think the the more" + }, + { + "start": 1632.679, + "text": "consensus choices for things that we can" + }, + { + "start": 1635.08, + "text": "do in architecture. Um now, this one I" + }, + { + "start": 1637.6, + "text": "think is a really fun idea, but one that" + }, + { + "start": 1639.679, + "text": "I think now the test of time has shown" + }, + { + "start": 1642.08, + "text": "maybe is not quite as good or maybe not" + }, + { + "start": 1644.12, + "text": "as popular of an idea." + }, + { + "start": 1646.04, + "text": "Um normally, we do our transformer" + }, + { + "start": 1648.2, + "text": "blocks serially, right? We compute our" + }, + { + "start": 1650.24, + "text": "attention, then we compute the MLP," + }, + { + "start": 1652.52, + "text": "right? One after the other." + }, + { + "start": 1654.2, + "text": "Um if you're very systems-minded, you" + }, + { + "start": 1656.32, + "text": "might say, \"Well, this introduces a" + }, + { + "start": 1657.72, + "text": "bottleneck, right? I have to wait for" + }, + { + "start": 1659.52, + "text": "the computation of one to do the other." 
+ }, + { + "start": 1662.0, + "text": "If they were instead in parallel, I" + }, + { + "start": 1663.679, + "text": "could bring to bear some new and cool" + }, + { + "start": 1665.48, + "text": "systems optimizations, potentially," + }, + { + "start": 1667.2, + "text": "right?\" So, you might ask, \"Could we" + }, + { + "start": 1668.88, + "text": "parallelize the transformer block?\"" + }, + { + "start": 1671.8, + "text": "And um this was originally an idea that" + }, + { + "start": 1675.16, + "text": "was in GPT-J, which is the open-source" + }, + { + "start": 1677.84, + "text": "attempted replication of GPT-3." + }, + { + "start": 1680.92, + "text": "Um and kind of very interestingly, I" + }, + { + "start": 1683.92, + "text": "think GPT-J has been surprisingly" + }, + { + "start": 1686.04, + "text": "influential in sort of propagating a lot" + }, + { + "start": 1689.32, + "text": "of ideas. I mean, PaLM as well. Google" + }, + { + "start": 1692.28, + "text": "um is actually surprisingly bold with" + }, + { + "start": 1694.2, + "text": "the architectures that they do. Um but" + }, + { + "start": 1696.16, + "text": "the description in uh PaLM, uh which you" + }, + { + "start": 1698.96, + "text": "can see in their report, is kind of the" + }, + { + "start": 1700.64, + "text": "following. Instead of nesting this," + }, + { + "start": 1702.32, + "text": "which is the sequential format at the" + }, + { + "start": 1703.6, + "text": "top, you know, you're just going to add" + }, + { + "start": 1705.28, + "text": "together the output of the MLP and" + }, + { + "start": 1707.32, + "text": "attention layer, and just add both of" + }, + { + "start": 1709.24, + "text": "those back into the residual stream." + }, + { + "start": 1712.24, + "text": "Um if you implement this right, you can" + }, + { + "start": 1714.6, + "text": "actually share a lot of the components." 
+ }, + { + "start": 1716.12, + "text": "Like, you can share the layer norms, you" + }, + { + "start": 1717.64, + "text": "can fuse the matrix multiplies. This" + }, + { + "start": 1720.159, + "text": "allows you to potentially get additional" + }, + { + "start": 1722.2, + "text": "systems optimizations. Um" + }, + { + "start": 1725.08, + "text": "And I think a lot of the people that" + }, + { + "start": 1726.76, + "text": "have been influenced by Google, so" + }, + { + "start": 1728.88, + "text": "Cohere, you know, was founded from one" + }, + { + "start": 1730.64, + "text": "of the former" + }, + { + "start": 1731.96, + "text": "uh transformer authors, they do a lot of" + }, + { + "start": 1734.08, + "text": "Google-inspired optimizations. They" + }, + { + "start": 1735.64, + "text": "followed kind of this architecture. Um" + }, + { + "start": 1738.28, + "text": "but not very many others. Um this has" + }, + { + "start": 1740.08, + "text": "been a approach that has really fallen" + }, + { + "start": 1741.96, + "text": "out of popularity over the past, I" + }, + { + "start": 1743.4, + "text": "think, 2 years. Um I think mainly" + }, + { + "start": 1745.64, + "text": "because optimization of the serial form" + }, + { + "start": 1748.04, + "text": "has gotten sufficiently good that the" + }, + { + "start": 1749.6, + "text": "systems gains from the second one just" + }, + { + "start": 1751.76, + "text": "isn't worth the small hits to uh" + }, + { + "start": 1754.72, + "text": "representation power that you end up" + }, + { + "start": 1756.24, + "text": "getting going from uh parallel to" + }, + { + "start": 1758.6, + "text": "serial." + }, + { + "start": 1759.84, + "text": "Effectively, you can think about it as" + }, + { + "start": 1761.48, + "text": "you've lost half of your depth, right?" 
+ }, + { + "start": 1763.08, + "text": "And that can be" + }, + { + "start": 1764.44, + "text": "uh a deleterious" + }, + { + "start": 1766.36, + "text": "uh thing to do to your model." + }, + { + "start": 1770.52, + "text": "So, in terms of the architecture things," + }, + { + "start": 1772.96, + "text": "actually, you know, the fact that this" + }, + { + "start": 1774.4, + "text": "is so short should kind of suggest to" + }, + { + "start": 1776.679, + "text": "you how much the original transformer" + }, + { + "start": 1779.36, + "text": "formulation has somewhat stood the test" + }, + { + "start": 1781.48, + "text": "of time, right? Cuz the only thing I'm" + }, + { + "start": 1782.88, + "text": "really talking about changing here" + }, + { + "start": 1785.56, + "text": "um is, you know, where the norms go, or" + }, + { + "start": 1788.0, + "text": "you know, whether we have bias terms, or" + }, + { + "start": 1790.28, + "text": "whether we gate the MLPs, but those are" + }, + { + "start": 1792.08, + "text": "actually pretty minor changes compared" + }, + { + "start": 1794.36, + "text": "to all the things that you can do." + }, + { + "start": 1796.44, + "text": "Now, uh, those of you that are, you" + }, + { + "start": 1798.48, + "text": "know, sort of carefully paying attention" + }, + { + "start": 1799.84, + "text": "might say, but wait, you know, there's a" + }, + { + "start": 1801.52, + "text": "lot of transformer alternatives that" + }, + { + "start": 1803.2, + "text": "change the attention. Um, yes, you'll" + }, + { + "start": 1805.4, + "text": "have to wait until next lecture because" + }, + { + "start": 1807.64, + "text": "today I'm just only going to cover sort" + }, + { + "start": 1809.24, + "text": "of core attention based methods. 
Um, and" + }, + { + "start": 1812.4, + "text": "next lecture I'll throw in a little bit" + }, + { + "start": 1813.84, + "text": "of a state space model stuff, but as" + }, + { + "start": 1816.28, + "text": "long as you're in this like dense" + }, + { + "start": 1817.4, + "text": "attention land, actually the" + }, + { + "start": 1819.16, + "text": "architecture from the original" + }, + { + "start": 1820.32, + "text": "transformer paper is pretty close to" + }, + { + "start": 1822.48, + "text": "what we do." + }, + { + "start": 1823.92, + "text": "So, you see, uh, quite a bit of this," + }, + { + "start": 1825.56, + "text": "right? So, uh, just now going back to" + }, + { + "start": 1827.4, + "text": "this, blue here is RMS norm block as" + }, + { + "start": 1829.76, + "text": "layer norm. You see most of the modern" + }, + { + "start": 1831.88, + "text": "models are sort of RMS norm models." + }, + { + "start": 1834.08, + "text": "Serial versus parallel layers, the blue" + }, + { + "start": 1835.84, + "text": "one's parallel, the rest is serial. You" + }, + { + "start": 1837.32, + "text": "see mostly serial layers. Um, pre-norm" + }, + { + "start": 1840.08, + "text": "versus post-norm. Some of these, uh," + }, + { + "start": 1842.0, + "text": "ones that I marked as post-norm are" + }, + { + "start": 1843.64, + "text": "actually pre and post-norm." + }, + { + "start": 1845.8, + "text": "Um, and then these ones on the right," + }, + { + "start": 1848.0, + "text": "these are GLUs, uh, almost always with" + }, + { + "start": 1850.6, + "text": "the exception of things like, uh," + }, + { + "start": 1852.04, + "text": "Falcon, which use a gated linear unit," + }, + { + "start": 1854.08, + "text": "but almost all of these are really, uh," + }, + { + "start": 1856.04, + "text": "gated linear units for modern models." 
+ }, + { + "start": 1859.4, + "text": "So, you can see the trends quite" + }, + { + "start": 1860.84, + "text": "visually, um," + }, + { + "start": 1862.44, + "text": "from what I'm telling you." + }, + { + "start": 1864.16, + "text": "Okay. So, really the thing that is very" + }, + { + "start": 1867.48, + "text": "different across implementations, and I" + }, + { + "start": 1869.6, + "text": "think a place where a lot of the" + }, + { + "start": 1870.92, + "text": "architecture stuff is still in flux, is" + }, + { + "start": 1873.44, + "text": "how you do kind of position dependence" + }, + { + "start": 1876.16, + "text": "and incorporate information from other" + }, + { + "start": 1877.92, + "text": "positions, right? So, the core attention" + }, + { + "start": 1879.92, + "text": "component in some sense." + }, + { + "start": 1881.92, + "text": "Um, so there are lots of different ways" + }, + { + "start": 1884.64, + "text": "that you can encode position into a" + }, + { + "start": 1886.28, + "text": "transformer." + }, + { + "start": 1887.92, + "text": "And just so you know, to to remind you," + }, + { + "start": 1889.68, + "text": "right? This is very, very important" + }, + { + "start": 1891.44, + "text": "because attention is positionally" + }, + { + "start": 1893.2, + "text": "independent, right? They're just inner" + }, + { + "start": 1894.48, + "text": "products, so you can just shuffle them" + }, + { + "start": 1896.4, + "text": "and attention would be the same if you" + }, + { + "start": 1898.08, + "text": "don't have a position embedding." 
+ }, + { + "start": 1900.0, + "text": "The original transformer had sine and" + }, + { + "start": 1901.44, + "text": "cosine embeddings, kind of like a" + }, + { + "start": 1903.08, + "text": "Fourier transform intuition that if you" + }, + { + "start": 1904.88, + "text": "have sines and cosines, then you can" + }, + { + "start": 1906.64, + "text": "kind of recover position from that no" + }, + { + "start": 1908.24, + "text": "matter what." + }, + { + "start": 1909.52, + "text": "Um, a number of other sort of large" + }, + { + "start": 1911.6, + "text": "models that, you know, followed soon" + }, + { + "start": 1913.64, + "text": "after that used absolute embeddings," + }, + { + "start": 1915.88, + "text": "where each position had its own" + }, + { + "start": 1917.88, + "text": "different embedding." + }, + { + "start": 1919.32, + "text": "Um, and then, uh, several other sort of" + }, + { + "start": 1922.2, + "text": "Google models like to use relative" + }, + { + "start": 1924.32, + "text": "embedding. So, in here you're not" + }, + { + "start": 1925.76, + "text": "adding, um, embeddings into the into the" + }, + { + "start": 1928.08, + "text": "embedding, uh, like word vector" + }, + { + "start": 1930.0, + "text": "embeddings, but instead you're adding a" + }, + { + "start": 1932.11, + "text": "s, but instead you're adding a" + }, + { + "start": 1932.12, + "text": "vector to the attention computation" + }, + { + "start": 1933.76, + "text": "itself, right? So, if you're three" + }, + { + "start": 1935.04, + "text": "positions off, sort of the attention" + }, + { + "start": 1936.8, + "text": "matrix gets a different offset added to" + }, + { + "start": 1939.16, + "text": "it. And and, you know, models like T5" + }, + { + "start": 1941.2, + "text": "and Chinchilla use kind of this scheme." 
+ }, + { + "start": 1943.6, + "text": "Um," + }, + { + "start": 1944.8, + "text": "the thing that has really become pretty" + }, + { + "start": 1947.04, + "text": "dominant in terms of position embedding" + }, + { + "start": 1949.36, + "text": "is this class of embeddings called rope," + }, + { + "start": 1952.12, + "text": "which some of you may be familiar with." + }, + { + "start": 1954.04, + "text": "Um, most models past 2024 use this type" + }, + { + "start": 1956.88, + "text": "of embedding. And it's kind of" + }, + { + "start": 1957.88, + "text": "remarkable given that rope, you know, in" + }, + { + "start": 1960.28, + "text": "some ways came out of nowhere." + }, + { + "start": 1962.32, + "text": "Um, originally I think this was also a" + }, + { + "start": 1964.16, + "text": "GPT-J innovation, um," + }, + { + "start": 1966.679, + "text": "from I think, uh, sort of not very well" + }, + { + "start": 1969.28, + "text": "known sort of blog post and, uh, paper" + }, + { + "start": 1971.96, + "text": "combination, uh, from an author in" + }, + { + "start": 1973.72, + "text": "China. Um, but really it has some really" + }, + { + "start": 1976.88, + "text": "interesting ideas for for why you would" + }, + { + "start": 1979.32, + "text": "do something like rope." + }, + { + "start": 1981.28, + "text": "So, rope, you know, is a relative" + }, + { + "start": 1983.16, + "text": "position embedding. And a relative" + }, + { + "start": 1985.28, + "text": "position embedding, let's make an" + }, + { + "start": 1987.07, + "text": ", let's make an" + }, + { + "start": 1987.08, + "text": "opinionated stance that I should not" + }, + { + "start": 1990.12, + "text": "care about the absolute position of any" + }, + { + "start": 1992.88, + "text": "words. So, if, you know, A uh, an apple" + }, + { + "start": 1995.92, + "text": "appear together, even if it appears at" + }, + { + "start": 1997.92, + "text": "the start or at the end, right? 
In rope" + }, + { + "start": 2000.28, + "text": "embeddings, they should kind of get the" + }, + { + "start": 2001.84, + "text": "same, uh, sort of result. Um, and we do" + }, + { + "start": 2005.6, + "text": "know that, you know, or and we want to" + }, + { + "start": 2008.88, + "text": "sort of represent it in this way, right?" + }, + { + "start": 2010.679, + "text": "So, I have an embedding F, and I have" + }, + { + "start": 2013.24, + "text": "another embedding F, and these are going" + }, + { + "start": 2015.04, + "text": "to take in the identity of the words X" + }, + { + "start": 2016.76, + "text": "and Y and the positions absolute of I" + }, + { + "start": 2019.2, + "text": "and J." + }, + { + "start": 2020.32, + "text": "And I want this to be equal if I take" + }, + { + "start": 2022.12, + "text": "the inner product of these embeddings to" + }, + { + "start": 2023.8, + "text": "be equal to a function that only depends" + }, + { + "start": 2027.52, + "text": "on the relative difference." + }, + { + "start": 2029.72, + "text": "Right? Um, and every existing embedding" + }, + { + "start": 2031.88, + "text": "before it didn't really fulfill this" + }, + { + "start": 2033.44, + "text": "equality. Like sine is not relative" + }, + { + "start": 2035.96, + "text": "because it has these absolute cross" + }, + { + "start": 2037.679, + "text": "terms that are not relative. Absolute" + }, + { + "start": 2039.76, + "text": "position embeddings, just by the the" + }, + { + "start": 2041.32, + "text": "name of it, is obviously not relative." + }, + { + "start": 2043.84, + "text": "And then relative embeddings," + }, + { + "start": 2045.8, + "text": "technically these are relative, but" + }, + { + "start": 2047.36, + "text": "they're not kind of embeddings because" + }, + { + "start": 2049.2, + "text": "they're just adding to the attention" + }, + { + "start": 2050.59, + "text": "just adding to the attention" + }, + { + "start": 2050.6, + "text": "matrix, right? 
So, there's no inner" + }, + { + "start": 2051.919, + "text": "product structure that, you know, you" + }, + { + "start": 2053.919, + "text": "can extract out of the" + }, + { + "start": 2056.04, + "text": "So, given this, you might ask, is there" + }, + { + "start": 2058.879, + "text": "a nice way that we can truly have this" + }, + { + "start": 2061.76, + "text": "relative embedding?" + }, + { + "start": 2063.679, + "text": "And the idea is very cool. Um, it's" + }, + { + "start": 2065.56, + "text": "really just looking at kind of uh," + }, + { + "start": 2067.96, + "text": "properties about angles and cosines. So," + }, + { + "start": 2071.6, + "text": "we want our embeddings to be invariant" + }, + { + "start": 2073.399, + "text": "to absolute positions, and we know that" + }, + { + "start": 2075.12, + "text": "inner products of any kind are invariant" + }, + { + "start": 2077.72, + "text": "to arbitrary rotation, right? So, the" + }, + { + "start": 2080.32, + "text": "idea is to say, I'm going to take my" + }, + { + "start": 2082.64, + "text": "semantic word vectors, the ones that are" + }, + { + "start": 2085.04, + "text": "are independent of any position. So," + }, + { + "start": 2087.0, + "text": "this is my starting point. And then I'm" + }, + { + "start": 2089.28, + "text": "going to rotate each of these vectors," + }, + { + "start": 2091.24, + "text": "in this case in 2D, um, based on the" + }, + { + "start": 2094.679, + "text": "position that the words appear. So, you" + }, + { + "start": 2097.48, + "text": "know, just as a" + }, + { + "start": 2099.16, + "text": "uh, simple example, we, let's say we" + }, + { + "start": 2101.64, + "text": "have the uh, sentence, we know that," + }, + { + "start": 2104.12, + "text": "right? We appear at position zero, so" + }, + { + "start": 2106.72, + "text": "I'm not going to touch that at all," + }, + { + "start": 2107.92, + "text": "right? 
I'm just going to keep that where" + }, + { + "start": 2109.0, + "text": "it is." + }, + { + "start": 2110.04, + "text": "The word know is at position one, so I'm" + }, + { + "start": 2112.72, + "text": "going to rotate it by some angle, right?" + }, + { + "start": 2114.96, + "text": "And that's my my one position rotation." + }, + { + "start": 2117.96, + "text": "Now, what happens if I apply the same" + }, + { + "start": 2119.8, + "text": "idea to, uh, the following sequence, of" + }, + { + "start": 2123.52, + "text": "course we know, right? In this case, we" + }, + { + "start": 2125.76, + "text": "and know are still adjacent, they're" + }, + { + "start": 2127.04, + "text": "right next to each other, but their" + }, + { + "start": 2128.64, + "text": "absolute position is shifted, right? Of" + }, + { + "start": 2130.52, + "text": "course, you know, comes before we know" + }, + { + "start": 2132.16, + "text": "now. In this case, I'm going to rotate" + }, + { + "start": 2134.56, + "text": "the word we by two positions because" + }, + { + "start": 2136.52, + "text": "it's two index, right? 0 1 2. So, the" + }, + { + "start": 2139.68, + "text": "word we is in the second, uh, position" + }, + { + "start": 2141.76, + "text": "number two, so I rotate by two. I rotate" + }, + { + "start": 2144.52, + "text": "know by three positions cuz it's in" + }, + { + "start": 2146.16, + "text": "position number three, and what do you" + }, + { + "start": 2148.04, + "text": "know, the relative angle between these" + }, + { + "start": 2149.88, + "text": "two is still separated by one, right?" + }, + { + "start": 2152.64, + "text": "So, this is a very, very simple idea of" + }, + { + "start": 2155.12, + "text": "just using rotations, uh, to represent," + }, + { + "start": 2158.24, + "text": "um," + }, + { + "start": 2159.32, + "text": "position. 
And if we do that, then" + }, + { + "start": 2160.96, + "text": "anytime we take an inner product, those" + }, + { + "start": 2162.68, + "text": "inner products are going to be invariant" + }, + { + "start": 2164.68, + "text": "of absolute positions." + }, + { + "start": 2166.72, + "text": "Now, you might say, well, in two" + }, + { + "start": 2168.64, + "text": "dimensions that's pretty easy cuz you've" + }, + { + "start": 2170.32, + "text": "only really got one choice, you got" + }, + { + "start": 2171.8, + "text": "clockwise and counterclockwise, but in" + }, + { + "start": 2173.96, + "text": "high dimensions, there's an infinite" + }, + { + "start": 2176.2, + "text": "space of ways that you can rotate" + }, + { + "start": 2177.8, + "text": "vectors. So, what do you do in D" + }, + { + "start": 2179.12, + "text": "dimensions? Um, well, you do the" + }, + { + "start": 2181.16, + "text": "simplest possible thing and it works." + }, + { + "start": 2182.96, + "text": "The simplest possible thing is to reduce" + }, + { + "start": 2185.4, + "text": "it to the 2D case repeatedly. So, you" + }, + { + "start": 2187.24, + "text": "have a D-dimensional vector, just cut it" + }, + { + "start": 2189.24, + "text": "up into chunks of two, and each pair of" + }, + { + "start": 2192.64, + "text": "two dimensions gets rotated. And the" + }, + { + "start": 2195.359, + "text": "theta at which these things rotate vary," + }, + { + "start": 2197.84, + "text": "right? Some of them are very low" + }, + { + "start": 2199.4, + "text": "frequency, so they rotate very slowly," + }, + { + "start": 2201.24, + "text": "so they uh, they can capture long-range" + }, + { + "start": 2203.32, + "text": "dependence. Some of them rotate very" + }, + { + "start": 2205.24, + "text": "quickly, so they capture things like," + }, + { + "start": 2206.76, + "text": "are they neighbors to each other, right?" 
+ }, + { + "start": 2208.96, + "text": "Um, and then at the end, you know, after" + }, + { + "start": 2211.24, + "text": "I've rotated every pair of vectors, I" + }, + { + "start": 2213.359, + "text": "get sort of my final embeddings. So," + }, + { + "start": 2215.28, + "text": "this is the, you know, rope approach." + }, + { + "start": 2218.84, + "text": "the paper, if you read it, has a very" + }, + { + "start": 2221.44, + "text": "complex motivation about complex" + }, + { + "start": 2223.52, + "text": "numbers, but really I think the" + }, + { + "start": 2225.4, + "text": "intuitive way, at least to me, to think" + }, + { + "start": 2226.92, + "text": "about it, is to just you want to rotate" + }, + { + "start": 2229.48, + "text": "by reducing to the two-dimensional case," + }, + { + "start": 2231.4, + "text": "and you're just rotating every pair of" + }, + { + "start": 2232.92, + "text": "coordinates." + }, + { + "start": 2235.56, + "text": "Gemma 4 just came out on Thursday, and" + }, + { + "start": 2238.0, + "text": "they have like another different kind of" + }, + { + "start": 2239.76, + "text": "fun thing that they do, which they call," + }, + { + "start": 2241.64, + "text": "um, I think proportional rope or P-rope," + }, + { + "start": 2244.48, + "text": "um, which is a really strange way to" + }, + { + "start": 2246.48, + "text": "just say that the only thing they rotate" + }, + { + "start": 2247.92, + "text": "is the first two coordinates, but that's" + }, + { + "start": 2249.32, + "text": "another valid thing that you can do as" + }, + { + "start": 2251.28, + "text": "well. So, there's a lot of different" + }, + { + "start": 2252.4, + "text": "things that you can do in this space" + }, + { + "start": 2253.68, + "text": "that end up working." + }, + { + "start": 2255.6, + "text": "Okay. 
In practice, what you're going to" + }, + { + "start": 2258.4, + "text": "end up doing is, you know, you can take" + }, + { + "start": 2261.16, + "text": "your vector and you can make a sparse" + }, + { + "start": 2263.24, + "text": "multiply with sines and cosines, and" + }, + { + "start": 2265.52, + "text": "this is going to be giving you some way" + }, + { + "start": 2267.4, + "text": "of rotating your input vectors X's," + }, + { + "start": 2269.64, + "text": "right? So, X times, uh, W times R, this" + }, + { + "start": 2273.12, + "text": "is going to be your final embedding that" + }, + { + "start": 2274.76, + "text": "you get." + }, + { + "start": 2277.24, + "text": "Um, and finally, you know, this is a" + }, + { + "start": 2278.8, + "text": "sine and cosine, which looks a little" + }, + { + "start": 2280.6, + "text": "like sine embeddings, but it's really" + }, + { + "start": 2282.72, + "text": "important that I'm multiplying with" + }, + { + "start": 2284.28, + "text": "these sines and cosines rather than" + }, + { + "start": 2285.72, + "text": "using them as embeddings cuz that means" + }, + { + "start": 2287.64, + "text": "that there are no cross terms. Um, and" + }, + { + "start": 2289.44, + "text": "this is purely relative, right? There's" + }, + { + "start": 2291.2, + "text": "no absolute position information that" + }, + { + "start": 2292.8, + "text": "you'll get out of inner products." + }, + { + "start": 2295.4, + "text": "Um, if we really, really wanted to get" + }, + { + "start": 2297.08, + "text": "into low-level details and you ask like," + }, + { + "start": 2298.68, + "text": "how do I actually implement this thing," + }, + { + "start": 2300.28, + "text": "you know, you're going to have to do" + }, + { + "start": 2301.16, + "text": "that. 
Um, you have your usual attention" + }, + { + "start": 2304.0, + "text": "stuff, and then what you do is you" + }, + { + "start": 2305.52, + "text": "generate cosine and sine angles, um," + }, + { + "start": 2307.84, + "text": "based on the position IDs of sort of" + }, + { + "start": 2309.96, + "text": "where your sequence is, and then you're" + }, + { + "start": 2312.2, + "text": "going to apply those cosines and sines" + }, + { + "start": 2314.28, + "text": "onto both your queries and keys for your" + }, + { + "start": 2316.68, + "text": "sort of attention computation, um, and" + }, + { + "start": 2318.68, + "text": "you can either apply them as a matrix" + }, + { + "start": 2319.96, + "text": "multiply or you can go through and apply" + }, + { + "start": 2321.28, + "text": "them manually, uh, just as a rotation," + }, + { + "start": 2323.56, + "text": "right? Fairly straightforward, and you" + }, + { + "start": 2324.84, + "text": "would do this at the attention level" + }, + { + "start": 2326.88, + "text": "rather than at the very bottom to sort" + }, + { + "start": 2328.64, + "text": "of enforce position invariance every" + }, + { + "start": 2330.64, + "text": "time you're doing attention" + }, + { + "start": 2331.56, + "text": "computations." + }, + { + "start": 2333.359, + "text": "Okay. So, that was rope. Um, it is a" + }, + { + "start": 2335.52, + "text": "little bit confusing, but once you" + }, + { + "start": 2336.88, + "text": "understand the geometry of just rotating" + }, + { + "start": 2338.64, + "text": "things, it's actually fairly, uh," + }, + { + "start": 2340.32, + "text": "straightforward." + }, + { + "start": 2342.04, + "text": "Okay. I'm going to pause here for one" + }, + { + "start": 2344.56, + "text": "moment, um, in case anyone has any" + }, + { + "start": 2346.48, + "text": "questions about the various like" + }, + { + "start": 2348.48, + "text": "architecture bits. 
Um, we're going to" + }, + { + "start": 2350.4, + "text": "then talk about even lower-level details" + }, + { + "start": 2353.2, + "text": "about hyper parameters. So, yes." + }, + { + "start": 2356.8, + "text": "Do you know about any papers that do a" + }, + { + "start": 2357.92, + "text": "higher-dimensional rotation?" + }, + { + "start": 2359.64, + "text": "Higher-dimensional rotation never" + }, + { + "start": 2361.28, + "text": "worked?" + }, + { + "start": 2362.04, + "text": "It's a good question." + }, + { + "start": 2363.92, + "text": "I don't think so. By a" + }, + { + "start": 2364.88, + "text": "higher-dimensional rotation, like any," + }, + { + "start": 2366.72, + "text": "you know, 2D rotation in the space would" + }, + { + "start": 2368.88, + "text": "just be kind of a variant of this. You" + }, + { + "start": 2371.16, + "text": "could certainly do like any one manifold" + }, + { + "start": 2373.68, + "text": "that like is a closed loop. I have not" + }, + { + "start": 2375.16, + "text": "seen that." + }, + { + "start": 2376.88, + "text": "Yes. What do you recommend for this?" + }, + { + "start": 2379.04, + "text": "What do you think is the best way to" + }, + { + "start": 2380.48, + "text": "distill this kind of knowledge problem." + }, + { + "start": 2383.48, + "text": "People who are back to work boards." + }, + { + "start": 2385.64, + "text": "It's a good question. Um" + }, + { + "start": 2389.16, + "text": "I don't know if there's a way beyond" + }, + { + "start": 2391.72, + "text": "some combination of like looking broadly" + }, + { + "start": 2393.68, + "text": "enough to get a to get to get a pattern," + }, + { + "start": 2395.32, + "text": "which is what I the procedure I'm trying" + }, + { + "start": 2396.8, + "text": "to do in this lecture here. 
And then the" + }, + { + "start": 2398.44, + "text": "other one is to try it yourself even a" + }, + { + "start": 2400.08, + "text": "much smaller scale to form an intuition" + }, + { + "start": 2402.08, + "text": "and like a theory for how these things" + }, + { + "start": 2403.64, + "text": "come together. I think those two are" + }, + { + "start": 2405.16, + "text": "really the right ways. I think reading" + }, + { + "start": 2407.04, + "text": "any single paper in isolation is very" + }, + { + "start": 2408.8, + "text": "very difficult especially now because no" + }, + { + "start": 2411.72, + "text": "single paper seems to give any full" + }, + { + "start": 2413.56, + "text": "detail for a lot of language models" + }, + { + "start": 2415.08, + "text": "these days." + }, + { + "start": 2417.8, + "text": "Oh, lots of questions now. Okay, good." + }, + { + "start": 2419.52, + "text": "We'll go in Yeah. Um so I have a" + }, + { + "start": 2421.68, + "text": "question about the question on the" + }, + { + "start": 2422.96, + "text": "parallel layers and the the serial" + }, + { + "start": 2424.84, + "text": "layers. Yeah, I understand the modern" + }, + { + "start": 2427.0, + "text": "models are" + }, + { + "start": 2428.2, + "text": "thinking of the resource efficiency. So" + }, + { + "start": 2430.36, + "text": "they will use the parallel layers. They" + }, + { + "start": 2432.24, + "text": "have the" + }, + { + "start": 2433.0, + "text": "idea but there's there's a difference" + }, + { + "start": 2434.72, + "text": "there's a there's a big difference" + }, + { + "start": 2435.92, + "text": "between the accuracy um for these two" + }, + { + "start": 2438.52, + "text": "patterns, right? I want to know like" + }, + { + "start": 2439.68, + "text": "what's the What's the" + }, + { + "start": 2441.28, + "text": "What's the difference of of the" + }, + { + "start": 2442.52, + "text": "accuracy? 
Is it big enough to Is it" + }, + { + "start": 2444.56, + "text": "small enough to allow the current model" + }, + { + "start": 2446.84, + "text": "trainers to ignore that or is there any" + }, + { + "start": 2449.36, + "text": "problem? Yeah, I think you know, the" + }, + { + "start": 2452.4, + "text": "that's actually really mixed. So if you" + }, + { + "start": 2454.28, + "text": "read the original Palm paper, I think" + }, + { + "start": 2455.84, + "text": "they're like very confident about the" + }, + { + "start": 2457.76, + "text": "use of parallel layers like no" + }, + { + "start": 2459.08, + "text": "performance drop 15% systems utilization" + }, + { + "start": 2461.56, + "text": "improvement. So if you read just that" + }, + { + "start": 2463.2, + "text": "you'll kind of say like oh, it's just as" + }, + { + "start": 2464.8, + "text": "good. Um but I think a lot of the the" + }, + { + "start": 2467.52, + "text": "later Google models have stopped using" + }, + { + "start": 2469.24, + "text": "this, which you can take on as an" + }, + { + "start": 2471.24, + "text": "implicit signal that actually there" + }, + { + "start": 2472.32, + "text": "might be some losses. And once again," + }, + { + "start": 2474.68, + "text": "this one is a little bit hard to um to" + }, + { + "start": 2477.36, + "text": "get precise numbers on because no one's" + }, + { + "start": 2479.04, + "text": "done the ablations as far as I know on" + }, + { + "start": 2480.56, + "text": "parallel versus serial um controlled" + }, + { + "start": 2482.56, + "text": "nice ablations at least." + }, + { + "start": 2485.28, + "text": "Yeah." + }, + { + "start": 2486.36, + "text": "So yeah, so" + }, + { + "start": 2487.76, + "text": "what's the difference between like Eagle" + }, + { + "start": 2489.6, + "text": "and RoPE?" + }, + { + "start": 2494.92, + "text": "Yeah, yeah. 
I mean this difference is" + }, + { + "start": 2496.72, + "text": "really just like which of the" + }, + { + "start": 2497.84, + "text": "coordinates you're rotating." + }, + { + "start": 2499.44, + "text": "Like you don't rotate most of them" + }, + { + "start": 2500.88, + "text": "because a lot of the I mean the argument" + }, + { + "start": 2502.92, + "text": "originally I think is that the low" + }, + { + "start": 2505.12, + "text": "frequency parts just aren't rotating" + }, + { + "start": 2507.48, + "text": "very much. And so you can drop them if" + }, + { + "start": 2509.68, + "text": "you're really strapped for you know," + }, + { + "start": 2512.12, + "text": "sort of extra space. And these this is" + }, + { + "start": 2514.2, + "text": "really a optimization for teeny tiny" + }, + { + "start": 2516.0, + "text": "models where like you don't have very" + }, + { + "start": 2517.48, + "text": "much like hidden dimensions to to have" + }, + { + "start": 2519.72, + "text": "activations for." + }, + { + "start": 2523.56, + "text": "For the relative embeddings not having" + }, + { + "start": 2526.6, + "text": "an inner product," + }, + { + "start": 2528.32, + "text": "um is that cuz it only applies to keys" + }, + { + "start": 2530.52, + "text": "specifically? I'm trying to understand" + }, + { + "start": 2532.12, + "text": "the logic. Yeah, so they applied both" + }, + { + "start": 2533.64, + "text": "the the keys and values, which is kind" + }, + { + "start": 2535.84, + "text": "of why you know, you get this like" + }, + { + "start": 2537.4, + "text": "relative effect from where you are. Um" + }, + { + "start": 2541.04, + "text": "you want to not have cross terms, right?" 
+ }, + { + "start": 2543.12, + "text": "So so if you look at the sine and cosine" + }, + { + "start": 2544.76, + "text": "embeddings, then you'll not only get" + }, + { + "start": 2547.04, + "text": "sort of the you know, the original" + }, + { + "start": 2549.56, + "text": "vectors, you'll kind of get these weird" + }, + { + "start": 2551.32, + "text": "cross terms between the position" + }, + { + "start": 2552.52, + "text": "embeddings and the word embedding" + }, + { + "start": 2554.92, + "text": "themselves and so on and so forth. And" + }, + { + "start": 2557.16, + "text": "then you can kind of back out what the" + }, + { + "start": 2558.4, + "text": "absolute position is. So even sine and" + }, + { + "start": 2560.56, + "text": "cosine embeddings are not like pure" + }, + { + "start": 2562.4, + "text": "relative position embeddings. Um" + }, + { + "start": 2565.0, + "text": "you know, you have to accept the premise" + }, + { + "start": 2566.56, + "text": "that you know, the relative embedding is" + }, + { + "start": 2568.24, + "text": "what you want. But once you do kind of" + }, + { + "start": 2570.24, + "text": "you end up at the RoPE solution somewhat" + }, + { + "start": 2572.24, + "text": "naturally." + }, + { + "start": 2580.8, + "text": "So what's the issue with So the issue" + }, + { + "start": 2582.48, + "text": "with this is that it just can't be" + }, + { + "start": 2583.76, + "text": "factorized as an inner product. That's" + }, + { + "start": 2585.36, + "text": "more of an aesthetic problem, right?" + }, + { + "start": 2586.64, + "text": "Like if if your constraints are I need" + }, + { + "start": 2588.8, + "text": "it to be uh relative and I need it to" + }, + { + "start": 2591.32, + "text": "factorize as f of xi and f of yj, then" + }, + { + "start": 2594.96, + "text": "this is not a solution in that class. 
Um" + }, + { + "start": 2597.16, + "text": "to be fair, there's a lot of um" + }, + { + "start": 2599.08, + "text": "embeddings that work this way that do" + }, + { + "start": 2601.04, + "text": "work like Alibi and other kinds of like" + }, + { + "start": 2603.24, + "text": "approaches like do do this kind of" + }, + { + "start": 2605.28, + "text": "inject into the attention matrix and" + }, + { + "start": 2607.12, + "text": "they do reasonably well. Um it's not" + }, + { + "start": 2610.92, + "text": "necessarily the one that's become the" + }, + { + "start": 2612.2, + "text": "dominant approach is what I can say." + }, + { + "start": 2616.16, + "text": "Cool. Okay." + }, + { + "start": 2617.96, + "text": "Great." + }, + { + "start": 2621.44, + "text": "Now we'll talk about hyperparameters um" + }, + { + "start": 2623.64, + "text": "and I think hyperparameters are really" + }, + { + "start": 2624.96, + "text": "something that you start to engage with" + }, + { + "start": 2626.48, + "text": "once you like actually have to train a" + }, + { + "start": 2628.2, + "text": "model, right? When your knowledge about" + }, + { + "start": 2630.36, + "text": "language models are abstract, you don't" + }, + { + "start": 2632.04, + "text": "have to care about any of these. But" + }, + { + "start": 2633.4, + "text": "once you have to instantiate it, you" + }, + { + "start": 2634.72, + "text": "start to ask questions like well, how" + }, + { + "start": 2637.12, + "text": "big should the feed forward size be?" + }, + { + "start": 2639.48, + "text": "Um how many heads should I have?" + }, + { + "start": 2641.76, + "text": "Um what should my vocab size be, right?" + }, + { + "start": 2645.0, + "text": "and you might also have questions of" + }, + { + "start": 2646.44, + "text": "like what should my weight decay or" + }, + { + "start": 2647.88, + "text": "dropout be? Like do I even need to" + }, + { + "start": 2649.48, + "text": "regularize? 
I I have a lot of tokens," + }, + { + "start": 2651.24, + "text": "right? So do I need regularization?" + }, + { + "start": 2653.6, + "text": "Um and do I need very deep models or" + }, + { + "start": 2655.56, + "text": "very wide models? Like what are the the" + }, + { + "start": 2657.359, + "text": "right kinds of things to do here, right?" + }, + { + "start": 2659.76, + "text": "Um and all of these if you start out" + }, + { + "start": 2661.359, + "text": "with no knowledge, it's actually very" + }, + { + "start": 2662.76, + "text": "daunting because you have to search this" + }, + { + "start": 2664.24, + "text": "like very big high dimensional space." + }, + { + "start": 2666.76, + "text": "Um but the space of things that people" + }, + { + "start": 2668.56, + "text": "try is actually pretty small. And from" + }, + { + "start": 2670.44, + "text": "that maybe you can start to think about" + }, + { + "start": 2671.96, + "text": "you know, smarter search processes of" + }, + { + "start": 2673.68, + "text": "like where you want to vary things." + }, + { + "start": 2677.32, + "text": "One of the things that's a really" + }, + { + "start": 2678.4, + "text": "consensus hyperparameter" + }, + { + "start": 2680.72, + "text": "um is this idea of the uh" + }, + { + "start": 2683.28, + "text": "ratio between the feed forward size," + }, + { + "start": 2685.24, + "text": "which is kind of the output of your" + }, + { + "start": 2686.52, + "text": "first matrix multiply in an MLP, and the" + }, + { + "start": 2689.28, + "text": "model dimension, right? So this is" + }, + { + "start": 2691.0, + "text": "really the the uh ratio of the two" + }, + { + "start": 2693.24, + "text": "dimensions of your W1 and your as well" + }, + { + "start": 2695.28, + "text": "your W2 matrix." 
+ }, + { + "start": 2697.68, + "text": "Um this seems like a thing that's very" + }, + { + "start": 2699.12, + "text": "important and controls kind of the" + }, + { + "start": 2700.48, + "text": "richness of your MLPs. So what should it" + }, + { + "start": 2703.359, + "text": "be? Well, for whatever reason, it should" + }, + { + "start": 2706.04, + "text": "maybe be four times your hidden" + }, + { + "start": 2708.12, + "text": "dimension, right? Um and this is a rule" + }, + { + "start": 2711.88, + "text": "of thumb that works remarkably well and" + }, + { + "start": 2714.16, + "text": "I will show you some data on like why" + }, + { + "start": 2716.68, + "text": "maybe this is a fine number to choose." + }, + { + "start": 2718.4, + "text": "There's a few exceptions um and funnily" + }, + { + "start": 2720.96, + "text": "enough, the really extreme exceptions" + }, + { + "start": 2722.68, + "text": "kind of backtrack on that." + }, + { + "start": 2726.04, + "text": "Um exception number one is variants of" + }, + { + "start": 2729.16, + "text": "the gated linear unit. I already told" + }, + { + "start": 2731.0, + "text": "you about this. So if you were thinking" + }, + { + "start": 2732.2, + "text": "about it, this is probably cached in" + }, + { + "start": 2733.88, + "text": "your head, right? GLUs have more" + }, + { + "start": 2736.32, + "text": "parameters, right? If you keep the same" + }, + { + "start": 2737.88, + "text": "dimensions. So if you want to keep the" + }, + { + "start": 2739.8, + "text": "parameter size of your MLPs the same," + }, + { + "start": 2742.08, + "text": "well, you need to scale down by 2/3," + }, + { + "start": 2744.32, + "text": "right? So most GLU variants, this means" + }, + { + "start": 2747.56, + "text": "that you're going to end up with" + }, + { + "start": 2748.52, + "text": "something like 2.67-ish," + }, + { + "start": 2751.4, + "text": "right? 
So everyone that's uh down here" + }, + { + "start": 2754.12, + "text": "2.67 to 2.5, this is roughly applying" + }, + { + "start": 2757.88, + "text": "this like 2/3 correction." + }, + { + "start": 2760.56, + "text": "Um and then for whatever reason, um the" + }, + { + "start": 2763.8, + "text": "Llama 2 folks decided, well, we actually" + }, + { + "start": 2767.48, + "text": "have very efficient um attention heads" + }, + { + "start": 2770.04, + "text": "with like um" + }, + { + "start": 2771.72, + "text": "uh" + }, + { + "start": 2772.56, + "text": "uh MQ A, which I'll talk about later. Um" + }, + { + "start": 2775.68, + "text": "and because of that, we can multiply" + }, + { + "start": 2777.68, + "text": "this ratio by an arbitrary 1.33 and" + }, + { + "start": 2780.2, + "text": "we'll get roughly 3.5. And so the Llama" + }, + { + "start": 2782.24, + "text": "people kind of like arbitrarily chose a" + }, + { + "start": 2783.76, + "text": "slightly different ratio, which" + }, + { + "start": 2785.32, + "text": "essentially emphasizes the MLPs a little" + }, + { + "start": 2787.32, + "text": "bit more. Um but really if you actually" + }, + { + "start": 2789.84, + "text": "look through all of the the papers," + }, + { + "start": 2791.4, + "text": "you'll find you know, either 2.6-ish or" + }, + { + "start": 2793.8, + "text": "3.5 for GLUs um or four if you're doing" + }, + { + "start": 2797.92, + "text": "uh non-GLU models." + }, + { + "start": 2801.76, + "text": "Okay. There's another exception, which I" + }, + { + "start": 2803.48, + "text": "find to be very funny but also very very" + }, + { + "start": 2805.4, + "text": "cool, which is um" + }, + { + "start": 2807.52, + "text": "you know, throughout as you read these" + }, + { + "start": 2809.24, + "text": "like technical reports, you'll find that" + }, + { + "start": 2811.48, + "text": "most people are just very boring in" + }, + { + "start": 2813.52, + "text": "their choice of architectures. 
They're" + }, + { + "start": 2814.8, + "text": "like we did Llama but we changed one" + }, + { + "start": 2816.52, + "text": "thing. Um but you know, folks at Google" + }, + { + "start": 2819.52, + "text": "are very bold sometimes um and T5 is one" + }, + { + "start": 2822.68, + "text": "of my favorite ones because they have" + }, + { + "start": 2824.32, + "text": "some really bold settings. Uh they" + }, + { + "start": 2826.8, + "text": "decided that um instead of following" + }, + { + "start": 2828.88, + "text": "this like 4x rule of thumb, they decided" + }, + { + "start": 2831.48, + "text": "that they want to have a 64x" + }, + { + "start": 2833.84, + "text": "multiplier, which is like way bigger" + }, + { + "start": 2835.84, + "text": "than four. Um and they have a reasonable" + }, + { + "start": 2838.52, + "text": "argument for this as well. This is" + }, + { + "start": 2839.56, + "text": "another like systems-based argument," + }, + { + "start": 2841.4, + "text": "right? They said, well, you know, if the" + }, + { + "start": 2843.52, + "text": "bigger my matrix multiplies, the more" + }, + { + "start": 2845.2, + "text": "efficient I can keep my hardware. So if" + }, + { + "start": 2847.8, + "text": "I make this, you know, multiplier really" + }, + { + "start": 2849.96, + "text": "big, then you know, my matrix multiplies" + }, + { + "start": 2853.28, + "text": "can potentially be sort of more" + }, + { + "start": 2855.6, + "text": "efficiently utilized, right?" + }, + { + "start": 2857.96, + "text": "Um and some others like Gemma 2 have" + }, + { + "start": 2860.88, + "text": "also tried to really push a little bit" + }, + { + "start": 2862.8, + "text": "higher on this. But really uh T5 is an" + }, + { + "start": 2866.12, + "text": "kind of astounding exception at 64. 
I" + }, + { + "start": 2868.32, + "text": "don't think any other model has really" + }, + { + "start": 2870.08, + "text": "gone that high in the feed forward" + }, + { + "start": 2871.84, + "text": "multiplier." + }, + { + "start": 2874.88, + "text": "and empirically, if you look at other" + }, + { + "start": 2877.6, + "text": "sort of works that try to do more" + }, + { + "start": 2879.2, + "text": "controlled comparisons of this ratio, um" + }, + { + "start": 2881.56, + "text": "I've taken this one from Kaplan in 2020." + }, + { + "start": 2883.48, + "text": "This is the classic uh neural scaling" + }, + { + "start": 2886.12, + "text": "laws paper um where they they do sort of" + }, + { + "start": 2889.12, + "text": "various controlled uh studies on" + }, + { + "start": 2891.28, + "text": "language models. You'll see, you know," + }, + { + "start": 2893.12, + "text": "this wasn't the point of the study," + }, + { + "start": 2894.56, + "text": "right? It was a scaling laws study. But" + }, + { + "start": 2896.0, + "text": "you'll see in one of the panels that" + }, + { + "start": 2897.72, + "text": "they actually have a sort of ablation or" + }, + { + "start": 2900.0, + "text": "sweep where they change the feed forward" + }, + { + "start": 2901.84, + "text": "ratio and they look at the loss, right?" + }, + { + "start": 2904.12, + "text": "Um for a very small model here, right?" + }, + { + "start": 2906.44, + "text": "But what they what they find in this" + }, + { + "start": 2908.0, + "text": "paper is there's a basin where you start" + }, + { + "start": 2910.96, + "text": "at about one and you end up about maybe" + }, + { + "start": 2912.6, + "text": "10 where this hyperparameter is like" + }, + { + "start": 2914.8, + "text": "pretty good and very very flat. You lose" + }, + { + "start": 2917.04, + "text": "very little relative to the optimal loss" + }, + { + "start": 2919.96, + "text": "down here, right? 
Um and then if you get" + }, + { + "start": 2922.32, + "text": "it really wrong, like you get, you know," + }, + { + "start": 2923.84, + "text": "above 10 to 100 or something like that," + }, + { + "start": 2926.2, + "text": "you know, then your loss starts really" + }, + { + "start": 2927.359, + "text": "shooting up quadratically." + }, + { + "start": 2929.16, + "text": "Um and so a lot of these choices that" + }, + { + "start": 2931.12, + "text": "range between like 2.6 to four, they're" + }, + { + "start": 2933.6, + "text": "all kind of falling into this relatively" + }, + { + "start": 2935.72, + "text": "nice basin. So you're fine choosing" + }, + { + "start": 2937.68, + "text": "those numbers, right?" + }, + { + "start": 2939.88, + "text": "Okay. So what can we learn about this" + }, + { + "start": 2942.04, + "text": "hyperparameter? Well, the default" + }, + { + "start": 2943.76, + "text": "choices have worked very well for nearly" + }, + { + "start": 2945.84, + "text": "all modern language models. So you can" + }, + { + "start": 2947.44, + "text": "safely choose that. Um T5 was a fine" + }, + { + "start": 2950.28, + "text": "model or the Virgin 1 T5 was a fine" + }, + { + "start": 2952.32, + "text": "model, right? Like it wasn't a bad" + }, + { + "start": 2953.55, + "text": ", right? Like it wasn't a bad" + }, + { + "start": 2953.56, + "text": "model. Um and so even radical choices" + }, + { + "start": 2955.79, + "text": ". Um and so even radical choices" + }, + { + "start": 2955.8, + "text": "can technically work, but it's probably" + }, + { + "start": 2958.48, + "text": "going to be compute inefficient. 
And I" + }, + { + "start": 2960.32, + "text": "think the funniest part of the saga of" + }, + { + "start": 2961.96, + "text": "this kind of the punchline of the T5" + }, + { + "start": 2963.56, + "text": "saga to me is that they have a follow-up" + }, + { + "start": 2965.56, + "text": "model T5 1.v v1.1 um that's like" + }, + { + "start": 2969.2, + "text": "supposed to be the improved version of" + }, + { + "start": 2970.4, + "text": "T5 and they kind of go back to the" + }, + { + "start": 2972.76, + "text": "standard 2.5 multiplier, you know? So" + }, + { + "start": 2974.88, + "text": "there's nothing explicitly stated here," + }, + { + "start": 2976.76, + "text": "but clearly, you know, when they tried" + }, + { + "start": 2978.32, + "text": "to to update T5, they decided that they" + }, + { + "start": 2980.44, + "text": "wanted to go back to a more standard" + }, + { + "start": 2981.96, + "text": "multiplier, which I find to be a little" + }, + { + "start": 2983.36, + "text": "bit funny." + }, + { + "start": 2985.08, + "text": "Okay. So, that's the, you know," + }, + { + "start": 2987.32, + "text": "feed-forward ratio," + }, + { + "start": 2989.08, + "text": "um which now you have like a rough sense" + }, + { + "start": 2991.0, + "text": "of like what the right order of" + }, + { + "start": 2992.0, + "text": "magnitude is." + }, + { + "start": 2993.52, + "text": "Now, let's talk about a different" + }, + { + "start": 2995.04, + "text": "consensus hyperparameter. 
Um I always" + }, + { + "start": 2997.52, + "text": "found this to be very strange when sort" + }, + { + "start": 2999.6, + "text": "of teaching uh 224N and, you know, just" + }, + { + "start": 3002.92, + "text": "sort of teaching students about this," + }, + { + "start": 3004.88, + "text": "which is, if you have a multi-head" + }, + { + "start": 3006.32, + "text": "attention, where you have multiple heads" + }, + { + "start": 3008.12, + "text": "for your attention in your transformer," + }, + { + "start": 3010.76, + "text": "um the canonical thing to do, the thing" + }, + { + "start": 3012.52, + "text": "that almost everyone does, is if you" + }, + { + "start": 3014.92, + "text": "have multiple heads, you make sure that" + }, + { + "start": 3017.76, + "text": "the size of those heads, the head" + }, + { + "start": 3019.32, + "text": "dimension, is such that you sort of have" + }, + { + "start": 3021.88, + "text": "the same dimension as a single-head" + }, + { + "start": 3023.68, + "text": "transformer, right? So, you always make" + }, + { + "start": 3025.2, + "text": "sure that you sort of divide the hidden" + }, + { + "start": 3028.6, + "text": "dimension to basically multiply with H." + }, + { + "start": 3031.2, + "text": "So, in this case, right, you have H, the" + }, + { + "start": 3032.68, + "text": "number of heads, and the dimension of" + }, + { + "start": 3034.4, + "text": "each head is D over H, so you multiply" + }, + { + "start": 3036.2, + "text": "the two and you get D, right? For some" + }, + { + "start": 3037.8, + "text": "reason, this is kind of the rule of" + }, + { + "start": 3038.92, + "text": "thumb." + }, + { + "start": 3039.88, + "text": "Um of course, this doesn't have to be" + }, + { + "start": 3041.44, + "text": "true. 
We can arbitrarily change the the" + }, + { + "start": 3043.36, + "text": "ratios between head dimensions and model" + }, + { + "start": 3045.12, + "text": "dimensions, but most models do follow" + }, + { + "start": 3047.8, + "text": "this guideline, and it turns out to work" + }, + { + "start": 3049.76, + "text": "pretty well. Um" + }, + { + "start": 3051.6, + "text": "You know, we can look at a variety of" + }, + { + "start": 3052.8, + "text": "different models, classic and new. I," + }, + { + "start": 3054.48, + "text": "you know, have the latest and greatest" + }, + { + "start": 3056.2, + "text": "quad as well, and you kind of find," + }, + { + "start": 3057.92, + "text": "yeah, the ratios are roughly around one" + }, + { + "start": 3060.32, + "text": "a model head. Um you know, notable" + }, + { + "start": 3062.68, + "text": "exception of T5, um and even Lambda," + }, + { + "start": 3065.76, + "text": "which is another Google model, um but" + }, + { + "start": 3068.08, + "text": "really everyone sticks around one. And I" + }, + { + "start": 3069.88, + "text": "think this is" + }, + { + "start": 3071.0, + "text": "kind of an interesting one." + }, + { + "start": 3073.12, + "text": "Um I think the thing about head" + }, + { + "start": 3074.92, + "text": "dimensions uh that I'll that I'll end" + }, + { + "start": 3076.96, + "text": "with here is I think this is yet another" + }, + { + "start": 3078.76, + "text": "kind of forgiving hyperparameter. Um" + }, + { + "start": 3080.92, + "text": "there's a couple of ablations that" + }, + { + "start": 3082.04, + "text": "people have done. There's once again a" + }, + { + "start": 3083.64, + "text": "pretty wide basin around one that you" + }, + { + "start": 3085.8, + "text": "can sort of get away with." + }, + { + "start": 3087.76, + "text": "Okay, but that one's maybe not the most" + }, + { + "start": 3089.64, + "text": "critical uh hyperparameter." 
+ }, + { + "start": 3091.96, + "text": "I think maybe one of the most critical" + }, + { + "start": 3093.52, + "text": "and interesting ones, I think" + }, + { + "start": 3094.84, + "text": "conceptually, is this idea of an aspect" + }, + { + "start": 3097.0, + "text": "ratio, right? Um and then sort of to add" + }, + { + "start": 3099.68, + "text": "an extra point here," + }, + { + "start": 3101.28, + "text": "um when you scale models up or down, the" + }, + { + "start": 3103.68, + "text": "way you usually do that is you fix an" + }, + { + "start": 3105.2, + "text": "aspect ratio, like how wide your model" + }, + { + "start": 3107.0, + "text": "is versus how deep it is, and then you" + }, + { + "start": 3108.72, + "text": "make the whole model bigger, right? So," + }, + { + "start": 3110.04, + "text": "the aspect ratio in some sense controls" + }, + { + "start": 3112.32, + "text": "the entire depth-to-width tradeoff as" + }, + { + "start": 3114.72, + "text": "you make models bigger, right?" + }, + { + "start": 3117.12, + "text": "Now, you might wonder how deep should my" + }, + { + "start": 3119.16, + "text": "model be. Like, if you've been following" + }, + { + "start": 3120.84, + "text": "all this stuff on like reasoning and so" + }, + { + "start": 3122.68, + "text": "on, you might think I need a really deep" + }, + { + "start": 3124.64, + "text": "model or really shallow model if I want" + }, + { + "start": 3126.56, + "text": "systems utilization. You might think" + }, + { + "start": 3128.64, + "text": "that there's a lot of sort of variation." + }, + { + "start": 3130.0, + "text": "And there is a lot of variation, um much" + }, + { + "start": 3132.08, + "text": "more so than other hyperparameters, but" + }, + { + "start": 3134.12, + "text": "there's actually like a fairly clear" + }, + { + "start": 3136.08, + "text": "sweet spot that most modern models fall" + }, + { + "start": 3138.88, + "text": "into." 
+ }, + { + "start": 3139.88, + "text": "Um you don't really see models go like" + }, + { + "start": 3142.68, + "text": "too uh too deep, um and you also don't" + }, + { + "start": 3145.76, + "text": "see models go too wide uh in either" + }, + { + "start": 3148.44, + "text": "direction, right? You see most models" + }, + { + "start": 3150.32, + "text": "have a ratio about a hundred um D model" + }, + { + "start": 3152.96, + "text": "over N layers. Um so, about hundred sort" + }, + { + "start": 3156.0, + "text": "of width for every layer that you have." + }, + { + "start": 3159.12, + "text": "I mean, this is true for like GPT-3 or" + }, + { + "start": 3160.72, + "text": "LLaMA or any one of these models." + }, + { + "start": 3164.88, + "text": "and really, I think the considerations" + }, + { + "start": 3167.24, + "text": "are partly a tradeoff between" + }, + { + "start": 3168.68, + "text": "expressiveness and hardware, right? If" + }, + { + "start": 3170.76, + "text": "you have an extremely extremely deep" + }, + { + "start": 3172.6, + "text": "model, um they get very very annoying to" + }, + { + "start": 3175.28, + "text": "deal with systems-wise. The deeper your" + }, + { + "start": 3177.04, + "text": "model, like, what is the ways that you" + }, + { + "start": 3178.76, + "text": "have for parallelizing them? Well, you" + }, + { + "start": 3180.72, + "text": "might have to cut up your layers. If you" + }, + { + "start": 3182.56, + "text": "cut up your layers," + }, + { + "start": 3184.2, + "text": "we'll talk about this in the systems" + }, + { + "start": 3185.44, + "text": "lecture. Once you start cutting up your" + }, + { + "start": 3186.84, + "text": "layers um depth-wise, you have very" + }, + { + "start": 3189.76, + "text": "serious issues in parallelization." 
+ }, + { + "start": 3191.52, + "text": "Pipeline parallel, which is what this is" + }, + { + "start": 3193.0, + "text": "called, is something that like most" + }, + { + "start": 3194.92, + "text": "people really really do not want to deal" + }, + { + "start": 3196.8, + "text": "with. Whereas width is much easier to" + }, + { + "start": 3199.32, + "text": "parallelize. If you have a really wide" + }, + { + "start": 3200.76, + "text": "model, you know, you can cut that up" + }, + { + "start": 3202.64, + "text": "very easily in your GPUs. Uh tensor" + }, + { + "start": 3205.08, + "text": "parallel is what it's called is much" + }, + { + "start": 3206.48, + "text": "much simpler to deal with." + }, + { + "start": 3209.56, + "text": "and so, in some sense, you know, there's" + }, + { + "start": 3211.12, + "text": "systems reasons to go wide, and maybe" + }, + { + "start": 3212.96, + "text": "there's expressiveness reasons to go" + }, + { + "start": 3214.44, + "text": "deep, and you end up at roughly a" + }, + { + "start": 3217.0, + "text": "hundred. Um and I think one of the" + }, + { + "start": 3220.56, + "text": "really interesting things about um" + }, + { + "start": 3223.04, + "text": "transformer hyperparameters is there are" + }, + { + "start": 3225.04, + "text": "a lot of hyperparameters that seem quite" + }, + { + "start": 3226.6, + "text": "important, but they're also fairly" + }, + { + "start": 3228.44, + "text": "forgiving, and people have converged" + }, + { + "start": 3230.12, + "text": "roughly on the minimum. 
This is yet" + }, + { + "start": 3232.16, + "text": "another plot from Kaplan et al., um" + }, + { + "start": 3234.28, + "text": "which shows another sweep over" + }, + { + "start": 3235.6, + "text": "hyperparameters for differently sized" + }, + { + "start": 3238.68, + "text": "Um and once again, you see, regardless" + }, + { + "start": 3240.56, + "text": "of kind of the size of your model," + }, + { + "start": 3242.52, + "text": "roughly speaking, the optimum aspect" + }, + { + "start": 3244.48, + "text": "ratio is fairly similar, and they live" + }, + { + "start": 3247.96, + "text": "at about a hundred, maybe a little bit" + }, + { + "start": 3249.6, + "text": "less depending on how you want to do the" + }, + { + "start": 3250.88, + "text": "accounting, but really, you know," + }, + { + "start": 3252.56, + "text": "anywhere near a hundred is a pretty safe" + }, + { + "start": 3254.52, + "text": "bet for aspect ratios." + }, + { + "start": 3257.4, + "text": "Um ETA and others uh did a number of" + }, + { + "start": 3261.08, + "text": "really interesting sort of like" + }, + { + "start": 3262.32, + "text": "architecture um" + }, + { + "start": 3265.12, + "text": "architecture variation experiments, in" + }, + { + "start": 3267.2, + "text": "which their general conclusion on this" + }, + { + "start": 3269.72, + "text": "was that let's look at the top panel" + }, + { + "start": 3271.96, + "text": "here. You have a lot of different kinds" + }, + { + "start": 3273.8, + "text": "of" + }, + { + "start": 3274.76, + "text": "uh models that you can have in terms of" + }, + { + "start": 3276.2, + "text": "depth-to-width tradeoffs, um but as you" + }, + { + "start": 3278.72, + "text": "sort of sweep the depth-to-width" + }, + { + "start": 3280.0, + "text": "tradeoffs, you find that really, um the" + }, + { + "start": 3282.88, + "text": "only thing that matters in some sense is" + }, + { + "start": 3284.52, + "text": "FLOPs. 
As you increase the FLOPs, the" + }, + { + "start": 3286.04, + "text": "models get better, and that's really" + }, + { + "start": 3287.76, + "text": "controlling the majority of the effects," + }, + { + "start": 3289.64, + "text": "not necessarily uh the aspect ratio. And" + }, + { + "start": 3292.4, + "text": "so, I think what has really emerged from" + }, + { + "start": 3294.48, + "text": "this is the sense that there's a general" + }, + { + "start": 3296.6, + "text": "forgiving band of hyperparameters that" + }, + { + "start": 3298.4, + "text": "people tend to choose, and then you" + }, + { + "start": 3300.2, + "text": "really worry about primarily your" + }, + { + "start": 3301.92, + "text": "systems utilization rather than sort of" + }, + { + "start": 3304.359, + "text": "expressiveness concerns, which are hard" + }, + { + "start": 3306.08, + "text": "to reason about." + }, + { + "start": 3309.8, + "text": "Um okay. And then maybe the last" + }, + { + "start": 3312.52, + "text": "hyperparameter thing uh that I want to" + }, + { + "start": 3314.6, + "text": "mention is vocabulary sizes. Um" + }, + { + "start": 3318.16, + "text": "and this one's kind of interesting to me" + }, + { + "start": 3320.2, + "text": "because there's a really clear" + }, + { + "start": 3321.359, + "text": "difference between two classes of" + }, + { + "start": 3323.2, + "text": "models. Um I think in the early days of" + }, + { + "start": 3326.359, + "text": "a lot of, you know, um" + }, + { + "start": 3328.28, + "text": "uh early days of open-source model" + }, + { + "start": 3330.96, + "text": "training, um" + }, + { + "start": 3332.6, + "text": "there were a lot of monolingual models" + }, + { + "start": 3334.16, + "text": "whose only goal was to be good on" + }, + { + "start": 3335.88, + "text": "English. And for those models, you had" + }, + { + "start": 3338.04, + "text": "these like much smaller vocab size, in" + }, + { + "start": 3340.16, + "text": "the 30,000 range. 
Um and then," + }, + { + "start": 3343.16, + "text": "post-LLaMA, a lot of people were really" + }, + { + "start": 3345.6, + "text": "interested in multilingual or like" + }, + { + "start": 3347.12, + "text": "production systems. So, these include" + }, + { + "start": 3348.64, + "text": "closed-source models like GPT-4. Um all" + }, + { + "start": 3351.2, + "text": "these have much much larger vocab sizes," + }, + { + "start": 3353.28, + "text": "and these are roughly in the hundred to" + }, + { + "start": 3355.0, + "text": "200,000" + }, + { + "start": 3356.48, + "text": "um vocab range." + }, + { + "start": 3358.56, + "text": "And you see generally that, you know," + }, + { + "start": 3359.64, + "text": "Google models have a ton more vocab. Um" + }, + { + "start": 3362.16, + "text": "LLaMA derivatives roughly range at about" + }, + { + "start": 3364.0, + "text": "a hundred uh thousand tokens, and then" + }, + { + "start": 3366.92, + "text": "the the sort of monolingual models are" + }, + { + "start": 3368.44, + "text": "about 30,000." + }, + { + "start": 3370.48, + "text": "Um this is somewhat clear. The" + }, + { + "start": 3372.08, + "text": "multilingual models really do need much" + }, + { + "start": 3373.8, + "text": "larger vocab to cover the whole space." + }, + { + "start": 3376.08, + "text": "Generally, the models on the right are" + }, + { + "start": 3377.44, + "text": "also bigger. There have been scaling law" + }, + { + "start": 3379.12, + "text": "studies showing that the bigger your" + }, + { + "start": 3380.52, + "text": "model, the larger the vocab it can" + }, + { + "start": 3382.76, + "text": "handle, and so this is also partially" + }, + { + "start": 3384.76, + "text": "driven by uh modern scaling trends," + }, + { + "start": 3387.04, + "text": "where the models on the right are" + }, + { + "start": 3388.0, + "text": "generally bigger. 
No one's training" + }, + { + "start": 3389.44, + "text": "small uh sorry, large monolingual models" + }, + { + "start": 3392.16, + "text": "uh anymore." + }, + { + "start": 3395.56, + "text": "Okay. So, um yeah." + }, + { + "start": 3408.92, + "text": "Sorry, uh the question was like, if you" + }, + { + "start": 3410.56, + "text": "have Sorry, multilingual models or" + }, + { + "start": 3412.72, + "text": "sorry? Multimodal. Multimodal. Yeah, so" + }, + { + "start": 3415.44, + "text": "I guess it depends on the way that your" + }, + { + "start": 3417.72, + "text": "tokens are encoded, but, you know, if" + }, + { + "start": 3419.28, + "text": "you're tokenizing your images and things" + }, + { + "start": 3420.56, + "text": "like that, then you need to, you know," + }, + { + "start": 3421.92, + "text": "have many more tokens to account for" + }, + { + "start": 3423.44, + "text": "those. Um if you look at like various" + }, + { + "start": 3425.48, + "text": "open-source releases, they'll have like" + }, + { + "start": 3426.84, + "text": "a different image tokenizer with its own" + }, + { + "start": 3428.32, + "text": "vocab, which is quite large." + }, + { + "start": 3431.64, + "text": "Uh how valid is it to" + }, + { + "start": 3433.88, + "text": "compare" + }, + { + "start": 3436.48, + "text": "bits uh bits per byte for different for" + }, + { + "start": 3439.04, + "text": "different tokenizers?" + }, + { + "start": 3441.76, + "text": "how valid is it to compare bits Oh, that" + }, + { + "start": 3443.44, + "text": "is a great question. Okay, yeah. Uh" + }, + { + "start": 3445.16, + "text": "that's not a hyperparameter question," + }, + { + "start": 3446.32, + "text": "but that is a good question. Um so," + }, + { + "start": 3449.52, + "text": "what is the right way? 
Okay, so so, let" + }, + { + "start": 3450.96, + "text": "me let me like" + }, + { + "start": 3452.32, + "text": "step back a moment and like put us in" + }, + { + "start": 3453.68, + "text": "the right mindset. So, if we think about" + }, + { + "start": 3456.8, + "text": "language modeling, language modeling is" + }, + { + "start": 3458.6, + "text": "is a generative modeling task, right? We" + }, + { + "start": 3460.16, + "text": "are modeling the probability of a" + }, + { + "start": 3461.359, + "text": "sequence." + }, + { + "start": 3462.44, + "text": "Now, as long as your sequence is fixed," + }, + { + "start": 3464.6, + "text": "right? It's the same. You have you know," + }, + { + "start": 3465.96, + "text": "adulterated it anyway, and you provide a" + }, + { + "start": 3468.2, + "text": "probability over all strings, that's" + }, + { + "start": 3470.28, + "text": "always valid to compare, right? At that" + }, + { + "start": 3472.24, + "text": "level of things, it's always valid." + }, + { + "start": 3474.2, + "text": "Now, when you ask the question, is it" + }, + { + "start": 3476.16, + "text": "valid to compare the bits per byte of uh" + }, + { + "start": 3479.12, + "text": "arbitrary token or or two arbitrary" + }, + { + "start": 3481.0, + "text": "tokenizers?" + }, + { + "start": 3482.68, + "text": "Really, there's two things at play. The" + }, + { + "start": 3484.44, + "text": "one thing is, you know, did you touch" + }, + { + "start": 3486.28, + "text": "the sequence at all? Like, if you look" + }, + { + "start": 3487.72, + "text": "at some tokenizers in the past, before" + }, + { + "start": 3489.8, + "text": "subword tokenizers, they would drop some" + }, + { + "start": 3491.88, + "text": "tokens or drop some words. That changes" + }, + { + "start": 3494.2, + "text": "that makes the comparisons invalid. But" + }, + { + "start": 3496.12, + "text": "modern tokenizers are complete. 
They can" + }, + { + "start": 3497.76, + "text": "model any sequence, so that's not a" + }, + { + "start": 3499.12, + "text": "concern. Um the other thing that you" + }, + { + "start": 3501.32, + "text": "have to worry about is, are we like" + }, + { + "start": 3503.4, + "text": "length normalizing it in any way, right?" + }, + { + "start": 3505.64, + "text": "But for bits per byte, you're always" + }, + { + "start": 3506.92, + "text": "normalizing with the same number, which" + }, + { + "start": 3508.8, + "text": "is the number of bytes, and so this is" + }, + { + "start": 3510.44, + "text": "always a valid comparison, right? So," + }, + { + "start": 3511.84, + "text": "that's kind of how to think about, you" + }, + { + "start": 3513.4, + "text": "know, tokenizer comparisons. So, for" + }, + { + "start": 3514.84, + "text": "example," + }, + { + "start": 3515.84, + "text": "uh I think they had the results showing" + }, + { + "start": 3517.88, + "text": "that" + }, + { + "start": 3519.16, + "text": "comparing perplexity for fixed" + }, + { + "start": 3520.68, + "text": "tokenizers" + }, + { + "start": 3522.16, + "text": "is is is is um" + }, + { + "start": 3524.8, + "text": "always leads to to better actual" + }, + { + "start": 3526.28, + "text": "performance." + }, + { + "start": 3527.04, + "text": "On on downstream network tasks." + }, + { + "start": 3529.2, + "text": "Is the same thing they were looking for?" + }, + { + "start": 3532.2, + "text": "Um perplexity and BPD are kind of dual" + }, + { + "start": 3534.56, + "text": "to each other, so yes, if that's what" + }, + { + "start": 3536.6, + "text": "you're asking." + }, + { + "start": 3538.6, + "text": "It's only yes and only no, cuz if you're" + }, + { + "start": 3540.8, + "text": "comparing" + }, + { + "start": 3544.52, + "text": "you could two frame compare the" + }, + { + "start": 3546.08, + "text": "perplexity as compared" + }, + { + "start": 3548.2, + "text": "but you're changing it different splits." 
+ }, + { + "start": 3550.92, + "text": "Changing it different [snorts]" + }, + { + "start": 3552.16, + "text": "Okay, we'll have to we'll have to talk" + }, + { + "start": 3553.48, + "text": "later cuz I'm not sure I understand the" + }, + { + "start": 3554.92, + "text": "question, but I think that that's an" + }, + { + "start": 3556.44, + "text": "interesting set of questions. Okay," + }, + { + "start": 3557.88, + "text": "good." + }, + { + "start": 3560.32, + "text": "All right." + }, + { + "start": 3561.8, + "text": "So, um you know, we're we're going" + }, + { + "start": 3563.76, + "text": "through really the the the low-level" + }, + { + "start": 3566.4, + "text": "lowest levels of details of language" + }, + { + "start": 3568.12, + "text": "modeling, which I think has really" + }, + { + "start": 3569.68, + "text": "exposed a lot of interesting ideas while" + }, + { + "start": 3571.68, + "text": "we sort of talk through this. And I" + }, + { + "start": 3573.36, + "text": "think dropout is one of the end" + }, + { + "start": 3574.92, + "text": "regularization, I think is another very" + }, + { + "start": 3577.6, + "text": "interesting class of ideas. Also one" + }, + { + "start": 3579.92, + "text": "that I think is very counterintuitive" + }, + { + "start": 3581.96, + "text": "from your machine learning 101" + }, + { + "start": 3583.36, + "text": "intuition." + }, + { + "start": 3584.72, + "text": "So, let's uh" + }, + { + "start": 3586.16, + "text": "go through what I think is like the the" + }, + { + "start": 3588.08, + "text": "standard argument for, you know," + }, + { + "start": 3590.28, + "text": "regularization. Um well, if I'm doing" + }, + { + "start": 3592.64, + "text": "language modeling, I have a lot of data," + }, + { + "start": 3594.4, + "text": "right? I have more data than I can" + }, + { + "start": 3595.72, + "text": "process most of the time, right? 
Unless" + }, + { + "start": 3597.76, + "text": "you're at, you know, Google, maybe even" + }, + { + "start": 3599.52, + "text": "then, there is more internet data than" + }, + { + "start": 3601.76, + "text": "there is flops. So, you know, I'm" + }, + { + "start": 3604.08, + "text": "probably not even going to see the same" + }, + { + "start": 3605.32, + "text": "data twice, right? Um so, I'm only going" + }, + { + "start": 3608.8, + "text": "to do a single pass on a corpus, and" + }, + { + "start": 3610.44, + "text": "there's very good reasons and arguments" + }, + { + "start": 3612.68, + "text": "to believe that a single pass of SGD or" + }, + { + "start": 3614.88, + "text": "other optimizers is never really going" + }, + { + "start": 3617.2, + "text": "to memorize my data very much, right?" + }, + { + "start": 3619.56, + "text": "So, this means overfitting is not really" + }, + { + "start": 3621.92, + "text": "a problem uh almost ever during compute" + }, + { + "start": 3624.76, + "text": "constrained language modeling." + }, + { + "start": 3626.92, + "text": "Now, you know, some people even actually" + }, + { + "start": 3629.16, + "text": "only look at training loss because they" + }, + { + "start": 3631.08, + "text": "believe so strongly that overfitting" + }, + { + "start": 3633.0, + "text": "doesn't happen in single pass SGD." + }, + { + "start": 3635.36, + "text": "Now, given this, you know, you can sort" + }, + { + "start": 3636.92, + "text": "of sit and think about this. Should I" + }, + { + "start": 3639.16, + "text": "use dropout or weight decay in language" + }, + { + "start": 3641.68, + "text": "model training, right?" + }, + { + "start": 3643.76, + "text": "Okay, you can think about it a bit." + }, + { + "start": 3647.84, + "text": "you know, one unfortunate thing is that" + }, + { + "start": 3649.6, + "text": "a lot of recent models don't talk about" + }, + { + "start": 3651.28, + "text": "this stuff at all. 
Um it's really" + }, + { + "start": 3653.68, + "text": "lower-level details than like tech" + }, + { + "start": 3655.88, + "text": "reports are willing to expose." + }, + { + "start": 3657.84, + "text": "Um but if you look, actually you find a" + }, + { + "start": 3660.56, + "text": "lot of models um do both. Especially" + }, + { + "start": 3663.68, + "text": "weight decay actually is a fairly" + }, + { + "start": 3665.44, + "text": "popular intervention even for modern" + }, + { + "start": 3668.8, + "text": "high-performance language models. Um" + }, + { + "start": 3671.92, + "text": "this is very, very surprising, you know?" + }, + { + "start": 3674.16, + "text": "I mean, some of the dropout things um" + }, + { + "start": 3677.24, + "text": "you know, maybe" + }, + { + "start": 3679.08, + "text": "uh have gone out of favor, but weight" + }, + { + "start": 3681.4, + "text": "decay actually remains fairly popular." + }, + { + "start": 3683.36, + "text": "And this is very mystifying. Like, why" + }, + { + "start": 3685.0, + "text": "is this?" + }, + { + "start": 3686.24, + "text": "Um and this is, you know, one of the" + }, + { + "start": 3687.8, + "text": "reasons why I think deep learning is" + }, + { + "start": 3689.12, + "text": "hard and this architecture lecture is" + }, + { + "start": 3691.08, + "text": "very strange and hard. Um it's because" + }, + { + "start": 3693.68, + "text": "these things interact in very strange" + }, + { + "start": 3695.48, + "text": "ways. So, there have been papers that" + }, + { + "start": 3698.0, + "text": "have argued um and shown nice evidence" + }, + { + "start": 3701.08, + "text": "that weight decay is actually not a" + }, + { + "start": 3703.8, + "text": "regularizer sometimes. It actually" + }, + { + "start": 3706.48, + "text": "interacts with the optimizer to" + }, + { + "start": 3709.96, + "text": "essentially um" + }, + { + "start": 3711.8, + "text": "make optimization better. 
Um so, if you" + }, + { + "start": 3715.0, + "text": "look at the training versus validation" + }, + { + "start": 3717.56, + "text": "loss across different weight decay" + }, + { + "start": 3719.16, + "text": "settings on, you know, language model" + }, + { + "start": 3721.12, + "text": "training for single pass SGD, you don't" + }, + { + "start": 3723.2, + "text": "really see any difference. Like, weight" + }, + { + "start": 3724.48, + "text": "decay isn't shifting things so the" + }, + { + "start": 3726.08, + "text": "validation loss is better. There's" + }, + { + "start": 3727.36, + "text": "already no overfitting. We're on the x" + }, + { + "start": 3728.88, + "text": "equals y line here, right? So, doesn't" + }, + { + "start": 3731.28, + "text": "control overfitting," + }, + { + "start": 3732.96, + "text": "but if we kind of look at different" + }, + { + "start": 3735.56, + "text": "levels of weight decay, and not only" + }, + { + "start": 3738.0, + "text": "just different levels of weight decay," + }, + { + "start": 3739.44, + "text": "we look at weight decay combined with" + }, + { + "start": 3742.48, + "text": "learning rate decay, um what we find is" + }, + { + "start": 3745.36, + "text": "that the stronger weight decay runs," + }, + { + "start": 3747.52, + "text": "these blue dash lines on the bottom, you" + }, + { + "start": 3750.08, + "text": "know, do significantly better because" + }, + { + "start": 3752.2, + "text": "they start out slow, but they" + }, + { + "start": 3754.8, + "text": "essentially end up um converging to a" + }, + { + "start": 3757.52, + "text": "much better minimum later. 
And this is," + }, + { + "start": 3759.64, + "text": "you know, generally true when we decay" + }, + { + "start": 3761.48, + "text": "learning rate, not necessarily true when" + }, + { + "start": 3764.16, + "text": "we're in constant learning rate, which" + }, + { + "start": 3765.52, + "text": "is maybe somewhat more of where your" + }, + { + "start": 3767.44, + "text": "intuition is coming from." + }, + { + "start": 3769.44, + "text": "So, you know, this is part of why it's" + }, + { + "start": 3771.4, + "text": "very difficult to reason sort of a" + }, + { + "start": 3773.6, + "text": "priori or like from scratch, you know," + }, + { + "start": 3775.84, + "text": "the behavior of all these different" + }, + { + "start": 3777.36, + "text": "choices and why, you know, I think Percy" + }, + { + "start": 3779.64, + "text": "and I have designed this class so that" + }, + { + "start": 3781.16, + "text": "you interact with stuff because, you" + }, + { + "start": 3783.32, + "text": "know, you might come upon this thing" + }, + { + "start": 3785.08, + "text": "that where basically weight decay is" + }, + { + "start": 3786.8, + "text": "actually an optimization intervention" + }, + { + "start": 3788.72, + "text": "and not necessarily a regularization" + }, + { + "start": 3791.52, + "text": "intervention, which is, you know, what" + }, + { + "start": 3792.56, + "text": "you would expect here, right? So, always" + }, + { + "start": 3794.52, + "text": "keep that in mind that these kinds of" + }, + { + "start": 3796.36, + "text": "unexpected effects can really start to" + }, + { + "start": 3798.68, + "text": "kick in" + }, + { + "start": 3799.96, + "text": "uh for these kinds of uh settings." + }, + { + "start": 3806.2, + "text": "All right. 
So, to put everything" + }, + { + "start": 3807.88, + "text": "together for hyperparameters, there's" + }, + { + "start": 3810.0, + "text": "actually for, you know, a lot of the the" + }, + { + "start": 3811.92, + "text": "maybe more hairy-looking" + }, + { + "start": 3813.12, + "text": "hyperparameters, actually just fairly" + }, + { + "start": 3815.24, + "text": "standard choices that have worked well" + }, + { + "start": 3817.0, + "text": "for everybody, right? You know, factor" + }, + { + "start": 3819.24, + "text": "of four rule of thumb, keep your head" + }, + { + "start": 3821.08, + "text": "dim and your number of heads uh equal to" + }, + { + "start": 3823.36, + "text": "the model dimension, um pick an aspect" + }, + { + "start": 3826.04, + "text": "ratio roughly around 100, um" + }, + { + "start": 3828.76, + "text": "and, you know, if you ask about" + }, + { + "start": 3830.2, + "text": "regularization, right? You want to maybe" + }, + { + "start": 3832.28, + "text": "try a couple things cuz regularization" + }, + { + "start": 3833.88, + "text": "actually does interact with optimizers" + }, + { + "start": 3836.56, + "text": "in ways that are quite counterintuitive," + }, + { + "start": 3838.32, + "text": "right? So, this is the thing that some" + }, + { + "start": 3839.44, + "text": "people uh still do even though you you" + }, + { + "start": 3841.6, + "text": "don't need the regularization at all." + }, + { + "start": 3844.28, + "text": "Actually, maybe I'll stop here in case" + }, + { + "start": 3845.76, + "text": "yeah." + }, + { + "start": 3847.08, + "text": "Are there any significant differences" + }, + { + "start": 3849.2, + "text": "maybe for like um" + }, + { + "start": 3850.8, + "text": "the future models?" + }, + { + "start": 3852.359, + "text": "Ooh, diffusions." + }, + { + "start": 3854.24, + "text": "That I have not looked into enough, to" + }, + { + "start": 3856.08, + "text": "be honest. 
Um there aren't that many" + }, + { + "start": 3857.88, + "text": "people training big diffusions is one" + }, + { + "start": 3859.359, + "text": "issue. Um and many of the models that" + }, + { + "start": 3861.4, + "text": "have been trained are retrofitted cuz I" + }, + { + "start": 3863.16, + "text": "think the architectures are actually the" + }, + { + "start": 3864.4, + "text": "same as the, you know, like a Lama-like" + }, + { + "start": 3866.32, + "text": "model. Um but if you're asking the" + }, + { + "start": 3867.84, + "text": "question of like, what's the optimal" + }, + { + "start": 3869.359, + "text": "architecture if you were to train from" + }, + { + "start": 3870.8, + "text": "scratch, I don't know what that is" + }, + { + "start": 3872.2, + "text": "actually off the top of my head." + }, + { + "start": 3874.04, + "text": "Yeah. Do you have any explanation for" + }, + { + "start": 3875.8, + "text": "why regularization works in some cases?" + }, + { + "start": 3879.2, + "text": "Well, I guess it's not that" + }, + { + "start": 3880.44, + "text": "regularization in general affects" + }, + { + "start": 3881.8, + "text": "optimization. I don't think people do" + }, + { + "start": 3882.92, + "text": "dropout anymore because, you know, it" + }, + { + "start": 3884.68, + "text": "doesn't really uh interact well with" + }, + { + "start": 3886.359, + "text": "optimization. But for example, weight" + }, + { + "start": 3887.6, + "text": "decay, you know, which is shrinkage to" + }, + { + "start": 3889.28, + "text": "zero, um that might allow you to use a" + }, + { + "start": 3891.6, + "text": "higher learning rate or it might allow" + }, + { + "start": 3893.16, + "text": "you to decay faster. There are lots of" + }, + { + "start": 3895.12, + "text": "ways in which all these terms are" + }, + { + "start": 3896.32, + "text": "interrelated." 
+ }, + { + "start": 3901.92, + "text": "Now, I've talked a lot about how to" + }, + { + "start": 3903.68, + "text": "design um expressive models by sort of" + }, + { + "start": 3907.44, + "text": "looking at all these other models that" + }, + { + "start": 3908.68, + "text": "have been trained." + }, + { + "start": 3909.92, + "text": "Um one of the things that I'll I'll" + }, + { + "start": 3911.4, + "text": "highlight now is over the last few years" + }, + { + "start": 3914.08, + "text": "um a really big emphasis has not been on" + }, + { + "start": 3916.96, + "text": "performance alone. It has actually been" + }, + { + "start": 3918.92, + "text": "on stability. And this becomes an" + }, + { + "start": 3921.04, + "text": "increasingly important concern as your" + }, + { + "start": 3923.2, + "text": "models get more and more expensive to" + }, + { + "start": 3925.28, + "text": "train, right? Um we've kind of seen that" + }, + { + "start": 3927.56, + "text": "a lot of these choices are forgiving," + }, + { + "start": 3929.16, + "text": "right? Everyone's kind of doing similar" + }, + { + "start": 3930.52, + "text": "stuff. And so, you know, you can mess" + }, + { + "start": 3932.96, + "text": "with these, but you're not going to get" + }, + { + "start": 3933.96, + "text": "a big performance difference. That's" + }, + { + "start": 3935.52, + "text": "fine. But if your model, you know," + }, + { + "start": 3937.68, + "text": "suddenly blows up some part into" + }, + { + "start": 3939.64, + "text": "training, like you get these like" + }, + { + "start": 3940.68, + "text": "horrible-looking spikes all over the" + }, + { + "start": 3942.56, + "text": "place, um you know, you might end up" + }, + { + "start": 3944.8, + "text": "with a model that is, you know, actually" + }, + { + "start": 3946.4, + "text": "not very good quality, right? Or it" + }, + { + "start": 3948.52, + "text": "might be unrecoverable. 
You might have" + }, + { + "start": 3949.88, + "text": "spent, you know, millions of dollars in" + }, + { + "start": 3951.2, + "text": "training, and, you know, you get to a" + }, + { + "start": 3952.96, + "text": "point where the model is no longer able" + }, + { + "start": 3954.72, + "text": "to be trained any further, right? That" + }, + { + "start": 3956.0, + "text": "would be a horrible thing to happen if" + }, + { + "start": 3957.68, + "text": "you have a lot of compute that you want" + }, + { + "start": 3958.8, + "text": "to spend." + }, + { + "start": 3960.28, + "text": "So, you don't want to train models that" + }, + { + "start": 3961.92, + "text": "look kind of like this blue curve with" + }, + { + "start": 3963.28, + "text": "like spikes everywhere and these, you" + }, + { + "start": 3964.96, + "text": "know, big gradient norms happening. Um" + }, + { + "start": 3967.4, + "text": "so, what do we do to fix these stability" + }, + { + "start": 3969.52, + "text": "issues? I mean, this is really, I would" + }, + { + "start": 3971.48, + "text": "say, like a core core issue." + }, + { + "start": 3973.72, + "text": "And, you know, if you have stability" + }, + { + "start": 3975.8, + "text": "issues in language models or in general" + }, + { + "start": 3979.0, + "text": "neural networks, there's a few, you" + }, + { + "start": 3981.32, + "text": "know, usual suspects that you've got to" + }, + { + "start": 3983.32, + "text": "start looking at." + }, + { + "start": 3984.64, + "text": "Um one of them is the soft maxes, and" + }, + { + "start": 3987.0, + "text": "the soft max has two things that are" + }, + { + "start": 3988.88, + "text": "both really bad for stability. One of" + }, + { + "start": 3990.88, + "text": "them is an exponential, right? We can" + }, + { + "start": 3992.32, + "text": "see how that blows up very quickly. 
Um" + }, + { + "start": 3994.6, + "text": "you also divide two numbers, and that's" + }, + { + "start": 3996.68, + "text": "also a potentially very dangerous" + }, + { + "start": 3998.359, + "text": "operation, right? So, a soft max is one" + }, + { + "start": 4001.2, + "text": "place where you got to be extra, extra" + }, + { + "start": 4002.76, + "text": "careful." + }, + { + "start": 4004.04, + "text": "And where are the soft maxes in a" + }, + { + "start": 4005.4, + "text": "language model? Well, there's two of" + }, + { + "start": 4007.24, + "text": "them. There's one on the output side" + }, + { + "start": 4009.56, + "text": "when we output our probability" + }, + { + "start": 4010.8, + "text": "distribution, and then in attention when" + }, + { + "start": 4012.88, + "text": "we normalize the attention, there's" + }, + { + "start": 4014.359, + "text": "going to be another soft max, right? So," + }, + { + "start": 4016.28, + "text": "we can think of both of those as really" + }, + { + "start": 4017.72, + "text": "kind of danger zones for our model, um" + }, + { + "start": 4021.24, + "text": "especially the attention." + }, + { + "start": 4025.44, + "text": "But okay." + }, + { + "start": 4026.68, + "text": "Let's start with thinking about the" + }, + { + "start": 4027.8, + "text": "output soft max. The output soft max can" + }, + { + "start": 4030.359, + "text": "blow up on us. Um" + }, + { + "start": 4032.48, + "text": "and one of the things that we can do is" + }, + { + "start": 4034.72, + "text": "we can try to control" + }, + { + "start": 4036.76, + "text": "um sort of the the normalizer problem." + }, + { + "start": 4039.04, + "text": "So, you know, let's sort of think about" + }, + { + "start": 4041.12, + "text": "the soft max calculation. We want to" + }, + { + "start": 4042.64, + "text": "compute a log probability to compute the" + }, + { + "start": 4044.64, + "text": "loss. Now, what is a log probability?" 
+ }, + { + "start": 4046.92, + "text": "Well, it's, you know, the output of your" + }, + { + "start": 4048.64, + "text": "model U, and then you've got this log" + }, + { + "start": 4051.08, + "text": "normalizer, right? This U is" + }, + { + "start": 4053.4, + "text": "well-behaved because in some sense this" + }, + { + "start": 4055.04, + "text": "is the output of your model, right? This" + }, + { + "start": 4056.64, + "text": "is just the output of your residual" + }, + { + "start": 4058.12, + "text": "stream with all the things that are" + }, + { + "start": 4059.28, + "text": "added in. So, if U is well-behaved, then" + }, + { + "start": 4061.84, + "text": "log P, the first term, is well-behaved," + }, + { + "start": 4063.84, + "text": "right? If the model is being okay." + }, + { + "start": 4065.56, + "text": "Now, the second term, this log Z, this" + }, + { + "start": 4068.4, + "text": "might not be so okay, right? If Z is" + }, + { + "start": 4070.6, + "text": "really big or really small, even if the" + }, + { + "start": 4072.84, + "text": "output of your model is somewhat" + }, + { + "start": 4074.32, + "text": "well-behaved, it could blow up. And what" + }, + { + "start": 4075.8, + "text": "is Z? Well, it's an exponential, right?" + }, + { + "start": 4077.52, + "text": "So, it could potentially blow up very" + }, + { + "start": 4079.12, + "text": "quickly on you." + }, + { + "start": 4080.32, + "text": "Or if this is zero, it could also blow" + }, + { + "start": 4081.76, + "text": "up on you, right? So, both of those" + }, + { + "start": 4082.8, + "text": "directions are very, very bad." + }, + { + "start": 4084.88, + "text": "Now, we would ideally like our Z to be" + }, + { + "start": 4087.32, + "text": "somewhere near one, right?" + }, + { + "start": 4089.8, + "text": "Um or log Z to be somewhere near zero." + }, + { + "start": 4093.04, + "text": "Um what can we do? Well, one of the" + }, + { + "start": 4095.2, + "text": "things that you notice, right? 
If you" + }, + { + "start": 4097.0, + "text": "sort of thought about the action of the" + }, + { + "start": 4098.6, + "text": "soft max, is this whole thing is" + }, + { + "start": 4100.719, + "text": "overparameterized, right? Um I could" + }, + { + "start": 4103.48, + "text": "sort of push things in and out. So, if I" + }, + { + "start": 4105.52, + "text": "add a constant to U, I can manipulate" + }, + { + "start": 4108.0, + "text": "the Zs without really affecting the" + }, + { + "start": 4110.12, + "text": "output of the soft max, right? You can" + }, + { + "start": 4111.4, + "text": "cancel out between the normalizer and" + }, + { + "start": 4113.839, + "text": "sort of the output of my model." + }, + { + "start": 4115.799, + "text": "So, because of that property, one thing" + }, + { + "start": 4117.96, + "text": "that I could do is I could add a" + }, + { + "start": 4119.0, + "text": "regularizer. Um this is from from Jacob" + }, + { + "start": 4121.2, + "text": "Devlin's paper 2024, uh sorry, 2014, um" + }, + { + "start": 4124.6, + "text": "in which he adds sort of this squared" + }, + { + "start": 4126.839, + "text": "log Z term. Um" + }, + { + "start": 4129.24, + "text": "and what this is doing is it's just" + }, + { + "start": 4130.799, + "text": "penalizing how far away your log Z is" + }, + { + "start": 4134.04, + "text": "from zero. And if log Z is near zero," + }, + { + "start": 4136.56, + "text": "that's nice because this whole" + }, + { + "start": 4138.08, + "text": "expression is kind of numerically" + }, + { + "start": 4140.12, + "text": "stable." + }, + { + "start": 4141.44, + "text": "This is called the Z loss trick. Um it's" + }, + { + "start": 4143.96, + "text": "been used by a number of papers. 
Um" + }, + { + "start": 4146.12, + "text": "Jacob Devlin and others uh sort of" + }, + { + "start": 4148.719, + "text": "popular or initially pioneered this back" + }, + { + "start": 4150.759, + "text": "in 2014, and then it's become popular" + }, + { + "start": 4153.16, + "text": "again through a number of open-source" + }, + { + "start": 4154.92, + "text": "models. Like, Baichuan I think was the" + }, + { + "start": 4156.44, + "text": "first open-source model to do it, but" + }, + { + "start": 4158.319, + "text": "then DCLM and Almo and others have been" + }, + { + "start": 4160.28, + "text": "using this trick to stabilize their" + }, + { + "start": 4162.16, + "text": "output soft maxes." + }, + { + "start": 4163.759, + "text": "So, this is this is a surprisingly" + }, + { + "start": 4165.52, + "text": "effective thing." + }, + { + "start": 4167.12, + "text": "Now, okay. So, let's say we've handled" + }, + { + "start": 4169.279, + "text": "the instability issues on the output" + }, + { + "start": 4171.04, + "text": "soft max." + }, + { + "start": 4172.92, + "text": "Now, we have to sort of turn our" + }, + { + "start": 4174.04, + "text": "attention towards the other potential" + }, + { + "start": 4175.44, + "text": "problem, which is attention, right? And" + }, + { + "start": 4177.759, + "text": "this is a a place where lots of" + }, + { + "start": 4180.4, + "text": "degeneracies happen. Lots of techniques" + }, + { + "start": 4182.24, + "text": "have been developed to control the" + }, + { + "start": 4184.4, + "text": "instability that attention operations" + }, + { + "start": 4186.799, + "text": "generate." 
+ }, + { + "start": 4187.96, + "text": "Um and really the, you know, the" + }, + { + "start": 4189.88, + "text": "high-level thing that I'll say" + }, + { + "start": 4192.04, + "text": "is" + }, + { + "start": 4193.08, + "text": "if you have instability, if you can" + }, + { + "start": 4194.76, + "text": "throw a layer norm in there somehow, it" + }, + { + "start": 4196.8, + "text": "might control it. And that's really in" + }, + { + "start": 4198.68, + "text": "some sense the the design philosophy" + }, + { + "start": 4201.36, + "text": "behind this idea called the QK norm. Um" + }, + { + "start": 4204.52, + "text": "so, what you do is remember that we" + }, + { + "start": 4206.04, + "text": "have, you know, our Qs and Ks um that" + }, + { + "start": 4209.2, + "text": "are going to be multiplied together, and" + }, + { + "start": 4211.32, + "text": "then they're going to go into the soft" + }, + { + "start": 4212.48, + "text": "max, right? So, in the standard" + }, + { + "start": 4213.84, + "text": "attention operation, I'm going to layer" + }, + { + "start": 4215.56, + "text": "norm as a pre-layer norm, multiply with" + }, + { + "start": 4218.0, + "text": "a QKV, and then I'm going to get my Qs" + }, + { + "start": 4219.92, + "text": "and Ks. Those will get multiplied by a" + }, + { + "start": 4221.96, + "text": "matrix multiply, I'll soft max them, and" + }, + { + "start": 4224.32, + "text": "I'll multiply that with V to get the" + }, + { + "start": 4226.08, + "text": "weighted average, and then I'll output" + }, + { + "start": 4227.76, + "text": "whatever comes after, right? So, this is" + }, + { + "start": 4229.44, + "text": "our usual attention. Now, what happens" + }, + { + "start": 4232.24, + "text": "if we just throw in a layer norm before" + }, + { + "start": 4235.36, + "text": "we multiply the Qs and Ks? 
If we do" + }, + { + "start": 4238.2, + "text": "that, then we know that the inputs to" + }, + { + "start": 4240.76, + "text": "this matrix multiply, and therefore the" + }, + { + "start": 4242.52, + "text": "inputs to the soft max, roughly have the" + }, + { + "start": 4245.36, + "text": "same scale. They're always going to have" + }, + { + "start": 4247.08, + "text": "a scale of roughly one because we've," + }, + { + "start": 4249.48, + "text": "you know, used RMS norm to divide the" + }, + { + "start": 4252.0, + "text": "size of those Qs and Ks." + }, + { + "start": 4254.64, + "text": "Okay. If we do that, then, you know," + }, + { + "start": 4257.16, + "text": "we're kind of going to keep this soft" + }, + { + "start": 4259.08, + "text": "max operation stable. Tons of different" + }, + { + "start": 4261.08, + "text": "models do this. It's originally from" + }, + { + "start": 4263.76, + "text": "the multimodal world." + }, + { + "start": 4266.52, + "text": "You know, some folks who were doing" + }, + { + "start": 4268.24, + "text": "making multimodal models sort of" + }, + { + "start": 4269.56, + "text": "initially discovered QK norm." + }, + { + "start": 4271.84, + "text": "E to fix and chameleon really, you know," + }, + { + "start": 4274.36, + "text": "used this and like proved it out. And" + }, + { + "start": 4276.32, + "text": "then a number of other open-source" + }, + { + "start": 4278.4, + "text": "language models, you know, realized that" + }, + { + "start": 4280.16, + "text": "the same tricks are entirely applicable" + }, + { + "start": 4282.48, + "text": "to" + }, + { + "start": 4283.72, + "text": "stabilizing attention for language" + }, + { + "start": 4285.44, + "text": "models, and I think this is now very," + }, + { + "start": 4287.2, + "text": "very standard. Like QK norm is actually" + }, + { + "start": 4289.04, + "text": "a very standard intervention that most" + }, + { + "start": 4291.16, + "text": "of the large models now introduce. 
Um it" + }, + { + "start": 4294.2, + "text": "doesn't seem to affect performance um" + }, + { + "start": 4296.36, + "text": "from lots of different training runs," + }, + { + "start": 4298.08, + "text": "but it does definitely prevent the kinds" + }, + { + "start": 4300.56, + "text": "of um attention degeneracies. Um and," + }, + { + "start": 4303.56, + "text": "you know, I I'm really" + }, + { + "start": 4304.709, + "text": ">> [laughter]" + }, + { + "start": 4305.24, + "text": ">> the the way that I've seen this is, you" + }, + { + "start": 4307.31, + "text": "the the way that I've seen this is, you" + }, + { + "start": 4307.32, + "text": "know, we have layer norms initially in" + }, + { + "start": 4309.32, + "text": "the pre-norm. Now, we add them after the" + }, + { + "start": 4311.76, + "text": "the" + }, + { + "start": 4312.64, + "text": "nonlinearities in each block, and now" + }, + { + "start": 4314.6, + "text": "we're throwing them in both the Qs and" + }, + { + "start": 4316.28, + "text": "the Ks. And really, I think this is is" + }, + { + "start": 4317.84, + "text": "is kind of getting at" + }, + { + "start": 4320.04, + "text": "the stabilization tricks that people" + }, + { + "start": 4321.68, + "text": "apply to this world. Okay." + }, + { + "start": 4324.08, + "text": "Now," + }, + { + "start": 4325.12, + "text": "um the final set of things that I'll" + }, + { + "start": 4326.84, + "text": "talk about as a stability intervention." + }, + { + "start": 4328.76, + "text": "And frankly, this one is not as popular" + }, + { + "start": 4332.0, + "text": "and more of a of a Google-specific trick" + }, + { + "start": 4334.88, + "text": "that I've seen. Um but uh logit soft" + }, + { + "start": 4338.32, + "text": "capping is a much harder intervention" + }, + { + "start": 4340.4, + "text": "that some people apply. 
So, this one, um" + }, + { + "start": 4343.12, + "text": "you know, in QK norm, what we're doing" + }, + { + "start": 4345.6, + "text": "is we are controlling the inputs to the" + }, + { + "start": 4347.28, + "text": "soft max and sort of hoping that the" + }, + { + "start": 4349.28, + "text": "outputs are well-behaved. If we really," + }, + { + "start": 4351.52, + "text": "really want to enforce" + }, + { + "start": 4353.28, + "text": "um well-behaved outputs, what we can do" + }, + { + "start": 4356.16, + "text": "is we can kind of take the logits, the" + }, + { + "start": 4357.64, + "text": "things that go straight into the soft" + }, + { + "start": 4359.16, + "text": "max, and we can just cap them off so" + }, + { + "start": 4361.0, + "text": "they can never be too large or too" + }, + { + "start": 4363.04, + "text": "small, right? This is a hard almost a" + }, + { + "start": 4364.68, + "text": "hard constraint. Um it's called a soft" + }, + { + "start": 4366.56, + "text": "cap, of course, but a tan H, you know," + }, + { + "start": 4367.92, + "text": "is bounded at some value." + }, + { + "start": 4370.48, + "text": "Um and so uh this is in the Gemma" + }, + { + "start": 4373.24, + "text": "models. Um I think both Gemma or like" + }, + { + "start": 4375.84, + "text": "Gemma's two, three, and four all use the" + }, + { + "start": 4378.48, + "text": "logit soft cap trick. Um and what they" + }, + { + "start": 4381.2, + "text": "do is they take all of their logits from" + }, + { + "start": 4383.8, + "text": "the attention layers, and then they soft" + }, + { + "start": 4385.76, + "text": "cap them at some value." + }, + { + "start": 4387.88, + "text": "Um some Nvidia folks have done actually" + }, + { + "start": 4390.2, + "text": "quite nice work doing systematic" + }, + { + "start": 4391.88, + "text": "comparisons of these stability" + }, + { + "start": 4393.2, + "text": "interventions. 
Um and what they find is" + }, + { + "start": 4397.04, + "text": "um if you start with a baseline model," + }, + { + "start": 4399.44, + "text": "you can do all sorts of different" + }, + { + "start": 4400.56, + "text": "interventions, and QK norm is here, and" + }, + { + "start": 4403.84, + "text": "it does slightly better due to the fact" + }, + { + "start": 4405.56, + "text": "that you can crank up the learning rate" + }, + { + "start": 4406.96, + "text": "a little bit. Um but if you do soft" + }, + { + "start": 4409.64, + "text": "capping alone, you actually end up" + }, + { + "start": 4411.56, + "text": "losing performance. So, there is a a" + }, + { + "start": 4413.4, + "text": "quality degradation that happens. This" + }, + { + "start": 4415.04, + "text": "is a very strong intervention. You can" + }, + { + "start": 4416.48, + "text": "never express very confident uh signals" + }, + { + "start": 4419.0, + "text": "in your soft max beyond a certain point." + }, + { + "start": 4421.4, + "text": "Um so, it does have some negative" + }, + { + "start": 4423.0, + "text": "consequences, but this is a very safe" + }, + { + "start": 4424.8, + "text": "way of stabilizing the outputs of your" + }, + { + "start": 4427.48, + "text": "attention. Or sorry, the the inputs to" + }, + { + "start": 4429.4, + "text": "your attention, the logits that go into" + }, + { + "start": 4430.84, + "text": "the soft max." + }, + { + "start": 4433.48, + "text": "So, that's kind of the end of the" + }, + { + "start": 4434.8, + "text": "stability components. Um I can pause for" + }, + { + "start": 4437.4, + "text": "a moment here. Um and I'll talk about" + }, + { + "start": 4439.96, + "text": "sort of various attention things um" + }, + { + "start": 4442.0, + "text": "after that." + }, + { + "start": 4448.84, + "text": "All right. 
So," + }, + { + "start": 4450.32, + "text": "the last thing I want to talk about" + }, + { + "start": 4451.52, + "text": "today is various interventions that you" + }, + { + "start": 4453.56, + "text": "can make to your attention head." + }, + { + "start": 4455.16, + "text": "Um and as I was saying at the beginning" + }, + { + "start": 4457.0, + "text": "of this lecture," + }, + { + "start": 4458.32, + "text": "um I'm only going to talk about all the" + }, + { + "start": 4460.28, + "text": "things that you can do to sort of dense" + }, + { + "start": 4462.56, + "text": "all by all attention today. So, if you" + }, + { + "start": 4464.28, + "text": "if you were interested in hearing about" + }, + { + "start": 4465.8, + "text": "state space models or linear time" + }, + { + "start": 4467.52, + "text": "attention, um sadly today is not the day" + }, + { + "start": 4469.84, + "text": "for you." + }, + { + "start": 4470.84, + "text": "Um the things that I do want to talk" + }, + { + "start": 4472.84, + "text": "about, which are really commonly" + }, + { + "start": 4474.56, + "text": "implemented um attention interventions" + }, + { + "start": 4477.2, + "text": "today, are uh group query attention," + }, + { + "start": 4480.08, + "text": "which really saves inference cost by" + }, + { + "start": 4481.84, + "text": "reducing the number of heads, um and" + }, + { + "start": 4484.64, + "text": "sparse or sliding window attention, um" + }, + { + "start": 4486.8, + "text": "which really originally came from the" + }, + { + "start": 4488.6, + "text": "GPT-3-ish" + }, + { + "start": 4490.16, + "text": "family, but have now really been adopted" + }, + { + "start": 4492.52, + "text": "widely by most models that are looking" + }, + { + "start": 4495.92, + "text": "to do uh long context unless they're" + }, + { + "start": 4497.6, + "text": "doing exotic uh SSM stuff." 
+ }, + { + "start": 4501.72, + "text": "So, I'll start with um group query" + }, + { + "start": 4503.6, + "text": "attention or GQA or MQA. Um" + }, + { + "start": 4506.4, + "text": "this I'm going to first set up the need" + }, + { + "start": 4509.0, + "text": "for these kinds of things, and then" + }, + { + "start": 4510.8, + "text": "you'll kind of hopefully see what the" + }, + { + "start": 4512.48, + "text": "what the trick is um and why it's fairly" + }, + { + "start": 4514.8, + "text": "natural." + }, + { + "start": 4516.16, + "text": "So, for the moment, we've been talking" + }, + { + "start": 4518.08, + "text": "about, you know, training and modeling" + }, + { + "start": 4519.84, + "text": "and all these things, but like let's" + }, + { + "start": 4521.0, + "text": "take a pause, and now let's think about" + }, + { + "start": 4523.04, + "text": "deployment, right? You train this very" + }, + { + "start": 4525.52, + "text": "big model, and now you need to serve it" + }, + { + "start": 4527.56, + "text": "to lots of users, and you're going to" + }, + { + "start": 4529.28, + "text": "pay a cost for serving." + }, + { + "start": 4531.44, + "text": "And you're going to have to, in abstract" + }, + { + "start": 4533.88, + "text": "sense, pay for two different resources," + }, + { + "start": 4535.92, + "text": "right? You're going to have to pay for" + }, + { + "start": 4536.8, + "text": "your flops, right? The computation that" + }, + { + "start": 4539.24, + "text": "you're performing, but you also have to" + }, + { + "start": 4540.88, + "text": "pay for another thing. You have to pay" + }, + { + "start": 4542.2, + "text": "for your memory accesses, right? Because" + }, + { + "start": 4544.72, + "text": "the memory accesses are also going to" + }, + { + "start": 4546.44, + "text": "impact, you know, your system's" + }, + { + "start": 4547.84, + "text": "characteristics, your latency, your" + }, + { + "start": 4549.4, + "text": "utilization, right? 
So, you want both of" + }, + { + "start": 4551.48, + "text": "these things to be small." + }, + { + "start": 4553.52, + "text": "Now, let's think about what happens" + }, + { + "start": 4555.4, + "text": "during training or alternatively prefill" + }, + { + "start": 4558.28, + "text": "when you're looking at your prompt where" + }, + { + "start": 4559.68, + "text": "someone gives you the stuff. In this" + }, + { + "start": 4561.64, + "text": "case, you know, the total arithmetic" + }, + { + "start": 4563.72, + "text": "operations you have is, you know, order" + }, + { + "start": 4566.32, + "text": "of magnitude batch size sequence length" + }, + { + "start": 4569.12, + "text": "hidden dim squared, right? That's" + }, + { + "start": 4570.52, + "text": "roughly the size of things that you get." + }, + { + "start": 4572.6, + "text": "And of course, you know, we're doing" + }, + { + "start": 4573.6, + "text": "quadratic attention, so we've got D" + }, + { + "start": 4574.96, + "text": "squared." + }, + { + "start": 4577.6, + "text": "we've got uh" + }, + { + "start": 4578.84, + "text": "total memory accesses. Like what is our" + }, + { + "start": 4580.64, + "text": "memory access that we have here? Um we" + }, + { + "start": 4582.76, + "text": "have batch times sequence length times" + }, + { + "start": 4584.72, + "text": "uh hidden dim plus um" + }, + { + "start": 4587.12, + "text": "the sort of cost of the soft max, which" + }, + { + "start": 4588.8, + "text": "has a N squared component, and then" + }, + { + "start": 4590.32, + "text": "we've got a D squared component um for" + }, + { + "start": 4592.8, + "text": "the for the projections. So, the" + }, + { + "start": 4594.56, + "text": "arithmetic intensity here is pretty" + }, + { + "start": 4596.0, + "text": "good. Um" + }, + { + "start": 4597.48, + "text": "it's going to be one over K. 
This is the" + }, + { + "start": 4599.0, + "text": "number of heads, so you need to have um" + }, + { + "start": 4601.28, + "text": "uh sorry, head dims. So, your head dims" + }, + { + "start": 4602.84, + "text": "need to be big enough that you're" + }, + { + "start": 4603.76, + "text": "multiplying some reasonably sized" + }, + { + "start": 4605.12, + "text": "matrices. And you've got a one over BN," + }, + { + "start": 4607.88, + "text": "so your sequences need to be long enough" + }, + { + "start": 4609.52, + "text": "or your batch sizes need to be big" + }, + { + "start": 4611.04, + "text": "enough. As long as both of these are" + }, + { + "start": 4612.96, + "text": "true, your GPUs are going to be fully" + }, + { + "start": 4614.92, + "text": "utilized. Great, right? You're You're" + }, + { + "start": 4616.52, + "text": "using all of your resources." + }, + { + "start": 4619.48, + "text": "Now, you know, we have done we have" + }, + { + "start": 4621.76, + "text": "finished training, and now we're serving" + }, + { + "start": 4623.16, + "text": "our users. How do we serve our users?" + }, + { + "start": 4624.96, + "text": "We're going to generate tokens and send" + }, + { + "start": 4626.44, + "text": "it to them, right? Now, for doing that," + }, + { + "start": 4629.16, + "text": "um I can't parallelize the generation" + }, + { + "start": 4631.16, + "text": "process. What I'm going to do is I'm" + }, + { + "start": 4632.56, + "text": "going to generate a token, I'm going to" + }, + { + "start": 4633.88, + "text": "condition on it, I'm going to generate" + }, + { + "start": 4635.04, + "text": "the next token, and I'm going to repeat" + }, + { + "start": 4636.64, + "text": "this process one by one, right? This is" + }, + { + "start": 4638.08, + "text": "just sort of the the curse of" + }, + { + "start": 4639.56, + "text": "autoregressive language modeling. We" + }, + { + "start": 4641.0, + "text": "have to do this." 
+ }, + { + "start": 4642.44, + "text": "Um in order to do this, the efficient" + }, + { + "start": 4644.88, + "text": "way to do it is to maintain all of the" + }, + { + "start": 4647.76, + "text": "sort of past keys and queries that I've" + }, + { + "start": 4649.8, + "text": "had in what's called a KV" + }, + { + "start": 4656.88, + "text": "over the past, and then whenever I need" + }, + { + "start": 4658.92, + "text": "to compute something new, I can reuse" + }, + { + "start": 4661.16, + "text": "sort of the the submatrices that I've" + }, + { + "start": 4663.48, + "text": "already had from the past. And I only" + }, + { + "start": 4665.6, + "text": "really need to compute sort of the new" + }, + { + "start": 4668.4, + "text": "um query key interactions that I need to" + }, + { + "start": 4671.4, + "text": "fill out the rest of this matrix, right?" + }, + { + "start": 4674.48, + "text": "So, every submatrix I've computed" + }, + { + "start": 4676.32, + "text": "before, I can keep. I only need to" + }, + { + "start": 4678.24, + "text": "compute my new ones. So, this saves a" + }, + { + "start": 4679.719, + "text": "lot on compute, right? That's great." + }, + { + "start": 4682.44, + "text": "But," + }, + { + "start": 4683.84, + "text": "the issue here is now our arithmetic" + }, + { + "start": 4686.2, + "text": "intensity is not so good, right? As you" + }, + { + "start": 4688.76, + "text": "might sort of intuit, this KV cache" + }, + { + "start": 4690.96, + "text": "approach is going to be reading and and" + }, + { + "start": 4693.76, + "text": "reading um parameters all the time," + }, + { + "start": 4696.2, + "text": "right? Each time I have a new step," + }, + { + "start": 4698.12, + "text": "right? I'm going to have to read in my" + }, + { + "start": 4699.43, + "text": "I'm going to have to read in my" + }, + { + "start": 4699.44, + "text": "parameters. 
I'm going to have to take" + }, + { + "start": 4700.64, + "text": "these dot products, and I'm going to do" + }, + { + "start": 4702.36, + "text": "this once every step." + }, + { + "start": 4704.52, + "text": "And so, now what do I have? Well, you" + }, + { + "start": 4706.68, + "text": "know, my total memory Oh, sorry. My" + }, + { + "start": 4708.52, + "text": "total arithmetic operations are the" + }, + { + "start": 4710.0, + "text": "same. I'm multiplying the same matrices" + }, + { + "start": 4712.0, + "text": "still, right? Just incrementally rather" + }, + { + "start": 4713.719, + "text": "than all at once. But because I'm doing" + }, + { + "start": 4716.12, + "text": "this incrementally, you know, now I have" + }, + { + "start": 4719.2, + "text": "um a a memory access pattern of batch by" + }, + { + "start": 4722.2, + "text": "sequence squared by hidden dim plus um" + }, + { + "start": 4725.48, + "text": "sequence by hidden dim squared. And the" + }, + { + "start": 4727.51, + "text": "by hidden dim squared. And the" + }, + { + "start": 4727.52, + "text": "second term is not so pleasant, right?" + }, + { + "start": 4730.36, + "text": "It used to be that it was just D" + }, + { + "start": 4733.4, + "text": "squared, but now we've got N times D" + }, + { + "start": 4736.68, + "text": "squared. And if we compute the" + }, + { + "start": 4737.99, + "text": ". And if we compute the" + }, + { + "start": 4738.0, + "text": "arithmetic intensity, which is the ratio" + }, + { + "start": 4739.64, + "text": "of these two guys, um now we have N over" + }, + { + "start": 4743.4, + "text": "D plus one over B. So, now what we need" + }, + { + "start": 4746.24, + "text": "is large batches plus short sequence" + }, + { + "start": 4748.24, + "text": "length, or we need really big model" + }, + { + "start": 4750.36, + "text": "dimensions. 
So, if we want to serve a" + }, + { + "start": 4752.16, + "text": "small model efficiently, this is not so" + }, + { + "start": 4756.32, + "text": "Right? Um this is really difficult to" + }, + { + "start": 4758.8, + "text": "deal with, right? This N over D term," + }, + { + "start": 4760.68, + "text": "this first term over here, which is" + }, + { + "start": 4762.64, + "text": "sequence length over hidden dim, is very" + }, + { + "start": 4765.36, + "text": "difficult to reduce if we're doing this" + }, + { + "start": 4767.0, + "text": "incremental computation. This is just a" + }, + { + "start": 4769.36, + "text": "hard thing to deal with." + }, + { + "start": 4772.44, + "text": "So, this leads to this idea of MQA or" + }, + { + "start": 4775.28, + "text": "multi-query attention. Normally, you" + }, + { + "start": 4777.56, + "text": "have multiple heads in your attention" + }, + { + "start": 4779.72, + "text": "operation, and you're going to have" + }, + { + "start": 4781.32, + "text": "different keys, different values, and" + }, + { + "start": 4783.28, + "text": "different queries. That's normally how" + }, + { + "start": 4784.87, + "text": "queries. That's normally how" + }, + { + "start": 4784.88, + "text": "you do things." + }, + { + "start": 4786.16, + "text": "But, one thing that we could do is maybe" + }, + { + "start": 4788.64, + "text": "we can keep the keys and the Vs the same" + }, + { + "start": 4791.12, + "text": "across all the heads, and the only thing" + }, + { + "start": 4793.16, + "text": "that's different across the heads are" + }, + { + "start": 4794.92, + "text": "the queries. If we do this, then this" + }, + { + "start": 4797.92, + "text": "drastically removes the amount of items" + }, + { + "start": 4801.44, + "text": "that need to be moved in and out of" + }, + { + "start": 4802.56, + "text": "memory, right? Because the KV cache," + }, + { + "start": 4804.52, + "text": "right? Are now significantly smaller." 
+ }, + { + "start": 4807.0, + "text": "These are all shared across all the" + }, + { + "start": 4808.56, + "text": "heads. Um this significantly reduces the" + }, + { + "start": 4811.72, + "text": "total memory access as well as the" + }, + { + "start": 4813.68, + "text": "arithmetic intensity, and we're kind of" + }, + { + "start": 4816.2, + "text": "the key term that we were talking about" + }, + { + "start": 4817.76, + "text": "here, we had the N over D term. Now, we" + }, + { + "start": 4820.48, + "text": "have uh H multiplying this, right? And" + }, + { + "start": 4823.48, + "text": "so, this H term allows us to" + }, + { + "start": 4825.88, + "text": "significantly reduce the the I sorry," + }, + { + "start": 4829.48, + "text": "increase the arithmetic intensity if we" + }, + { + "start": 4831.28, + "text": "have a lot of heads, right? This is a" + }, + { + "start": 4832.72, + "text": "significant gain over what we had" + }, + { + "start": 4835.24, + "text": "before." + }, + { + "start": 4836.72, + "text": "So, this gets us significant efficiency" + }, + { + "start": 4839.04, + "text": "improvements," + }, + { + "start": 4840.36, + "text": "but the issue with MQA is" + }, + { + "start": 4844.04, + "text": "this is on the right here, you have one" + }, + { + "start": 4845.72, + "text": "value and one key for all these queries." + }, + { + "start": 4848.72, + "text": "You do in fact lose significant" + }, + { + "start": 4850.44, + "text": "expressive power if you do this." + }, + { + "start": 4853.12, + "text": "And so, there's this trade-off between" + }, + { + "start": 4854.96, + "text": "system's efficiency and expressiveness," + }, + { + "start": 4856.96, + "text": "and you might wonder, is there sort of a" + }, + { + "start": 4858.84, + "text": "sweet spot in which we can avoid trading" + }, + { + "start": 4862.08, + "text": "off, you know, quite a significantly" + }, + { + "start": 4865.44, + "text": "expressive power and computation. 
And" + }, + { + "start": 4867.52, + "text": "that's where GQA or grouped query" + }, + { + "start": 4869.52, + "text": "attention comes in. You know, the" + }, + { + "start": 4871.04, + "text": "original transformer is multi-head. We" + }, + { + "start": 4872.96, + "text": "have queries and keys for each head. In" + }, + { + "start": 4875.48, + "text": "multi-query, we have one key and value" + }, + { + "start": 4877.84, + "text": "for each for all the heads. In grouped" + }, + { + "start": 4880.28, + "text": "query, we reduce the amount of keys and" + }, + { + "start": 4882.52, + "text": "values, but we keep the number of" + }, + { + "start": 4884.2, + "text": "queries the same. So, we now have this" + }, + { + "start": 4885.6, + "text": "ratio that we can play with, which is" + }, + { + "start": 4887.44, + "text": "kind of the number of key heads or the" + }, + { + "start": 4889.08, + "text": "number of value heads" + }, + { + "start": 4890.92, + "text": "while keeping the total number of heads" + }, + { + "start": 4892.44, + "text": "much larger than that. So, this allows" + }, + { + "start": 4894.16, + "text": "us to very simply control the" + }, + { + "start": 4896.2, + "text": "the trade-off between expressiveness and" + }, + { + "start": 4898.56, + "text": "inference efficiency." + }, + { + "start": 4900.4, + "text": "Um there are other sort of tricks from" + }, + { + "start": 4902.12, + "text": "DeepSeek-V2, multi-head latent" + }, + { + "start": 4904.08, + "text": "attention, that I'll sort of mention" + }, + { + "start": 4906.12, + "text": "briefly next time," + }, + { + "start": 4908.8, + "text": "which sort of have a different kind of" + }, + { + "start": 4910.36, + "text": "factorization structure and a different" + }, + { + "start": 4912.04, + "text": "set of trade-offs. 
But really," + }, + { + "start": 4914.72, + "text": "the nice thing about GQA is that in" + }, + { + "start": 4917.04, + "text": "practice the trade-off is quite" + }, + { + "start": 4918.56, + "text": "favorable. So, if you have multi-head," + }, + { + "start": 4921.0, + "text": "your performance, this was you know, in" + }, + { + "start": 4922.68, + "text": "the I think T5 days if I remember right." + }, + { + "start": 4925.96, + "text": "This is your downstream model" + }, + { + "start": 4927.8, + "text": "performance. This is your time per" + }, + { + "start": 4929.52, + "text": "sample. You want to reduce this as much" + }, + { + "start": 4931.04, + "text": "as possible." + }, + { + "start": 4932.36, + "text": "With multi-head attention, you have best" + }, + { + "start": 4933.76, + "text": "performance but very high cost." + }, + { + "start": 4936.36, + "text": "With MQA, you know, you have" + }, + { + "start": 4939.16, + "text": "lower cost but much lower performance." + }, + { + "start": 4942.12, + "text": "Similarly, if you make your model" + }, + { + "start": 4943.6, + "text": "smaller to try to hit your performance" + }, + { + "start": 4945.72, + "text": "targets, you get much worse performance." + }, + { + "start": 4947.48, + "text": "GQA really does get the best of both" + }, + { + "start": 4949.4, + "text": "worlds, you know, very low inference" + }, + { + "start": 4951.6, + "text": "cost, nearly the same performance as" + }, + { + "start": 4954.44, + "text": "your full multi-head." 
+ }, + { + "start": 4956.36, + "text": "Um and you see sort of this like GQA" + }, + { + "start": 4959.68, + "text": "group structure where if you have" + }, + { + "start": 4962.48, + "text": "a small reduction in the number of" + }, + { + "start": 4964.0, + "text": "heads," + }, + { + "start": 4965.08, + "text": "you basically have most of the gains in" + }, + { + "start": 4967.56, + "text": "your performance, which allows you to" + }, + { + "start": 4969.04, + "text": "sort of keep most of the expressive" + }, + { + "start": 4970.28, + "text": "power" + }, + { + "start": 4971.48, + "text": "while getting significant inference" + }, + { + "start": 4973.56, + "text": "improvements. And Percy will talk a" + }, + { + "start": 4975.36, + "text": "bunch more about sort of the inference" + }, + { + "start": 4976.76, + "text": "mechanics later," + }, + { + "start": 4978.68, + "text": "but sort of this should give you a" + }, + { + "start": 4979.68, + "text": "flavor of like why models today almost" + }, + { + "start": 4982.28, + "text": "all adopt this GQA structure because it" + }, + { + "start": 4984.92, + "text": "gives you a lot of this inference cost," + }, + { + "start": 4986.48, + "text": "which is really critical, without very" + }, + { + "start": 4988.92, + "text": "much of a" + }, + { + "start": 4991.2, + "text": "expressiveness hit." + }, + { + "start": 4994.44, + "text": "Cool. Any questions for for GQA or KV" + }, + { + "start": 4997.2, + "text": "cache? Yeah." 
+ }, + { + "start": 4999.76, + "text": "Given" + }, + { + "start": 5001.56, + "text": "that you have so many" + }, + { + "start": 5003.8, + "text": "like rules of thumb for what" + }, + { + "start": 5004.96, + "text": "hyper-parameters are good, like to what" + }, + { + "start": 5006.96, + "text": "extent are you still searching over" + }, + { + "start": 5008.28, + "text": "hyper-parameters versus exploiting these" + }, + { + "start": 5011.24, + "text": "rules of thumb that you" + }, + { + "start": 5013.84, + "text": "I think it's a mix of both. I think" + }, + { + "start": 5015.44, + "text": "every sort of model training run has" + }, + { + "start": 5017.32, + "text": "some theses about what can be varied." + }, + { + "start": 5020.08, + "text": "And so, you see this in a lot of the" + }, + { + "start": 5021.32, + "text": "reports where I think the" + }, + { + "start": 5022.8, + "text": "hyper-parameters are often not where" + }, + { + "start": 5024.28, + "text": "people are touching too much." + }, + { + "start": 5026.32, + "text": "But you see like architecture changes" + }, + { + "start": 5027.92, + "text": "like one at a time in a lot of these" + }, + { + "start": 5029.8, + "text": "reports. But it's very rare to like go" + }, + { + "start": 5032.0, + "text": "and change everything up. I think Google" + }, + { + "start": 5033.64, + "text": "is one of the only orgs that seems to" + }, + { + "start": 5035.92, + "text": "like really spice things up in a" + }, + { + "start": 5037.16, + "text": "significant way." + }, + { + "start": 5038.68, + "text": "The Gemma series has done some pretty" + }, + { + "start": 5040.6, + "text": "interesting things." 
+ }, + { + "start": 5042.64, + "text": "The most recent Gemma 4 release, for" + }, + { + "start": 5044.08, + "text": "example, now has like individual" + }, + { + "start": 5046.32, + "text": "embedding for every layer" + }, + { + "start": 5048.08, + "text": "in a way to control the trade-offs" + }, + { + "start": 5049.6, + "text": "between like memory use and flops." + }, + { + "start": 5051.12, + "text": "They're very interesting set of things" + }, + { + "start": 5052.32, + "text": "that they've done." + }, + { + "start": 5054.8, + "text": "Oh, yeah, back there." + }, + { + "start": 5056.68, + "text": "Do you experiment with like data all" + }, + { + "start": 5058.64, + "text": "things of these parameters" + }, + { + "start": 5061.24, + "text": "during training?" + }, + { + "start": 5062.44, + "text": "During training," + }, + { + "start": 5063.64, + "text": "let me think." + }, + { + "start": 5066.4, + "text": "Weight decay, yes. Weight decay, people" + }, + { + "start": 5068.76, + "text": "change in concert with like learning" + }, + { + "start": 5071.08, + "text": "rate." + }, + { + "start": 5072.12, + "text": "That is actually a heuristic that people" + }, + { + "start": 5073.64, + "text": "do that works very well." + }, + { + "start": 5075.76, + "text": "Um other than that, I don't know if" + }, + { + "start": 5078.84, + "text": "there's a lot of different hypers that" + }, + { + "start": 5080.88, + "text": "people change during training," + }, + { + "start": 5082.52, + "text": "especially because the architecture ones" + }, + { + "start": 5084.24, + "text": "just make training incompatible. So, you" + }, + { + "start": 5086.04, + "text": "can't really, you know, change them" + }, + { + "start": 5087.12, + "text": "while you're you're training." + }, + { + "start": 5089.48, + "text": "Yeah, so I think I think weight decay is" + }, + { + "start": 5090.96, + "text": "probably the one that I can think of." 
+ }, + { + "start": 5092.08, + "text": "The others are usually fixed." + }, + { + "start": 5096.28, + "text": "Yeah, MQA is uh" + }, + { + "start": 5099.2, + "text": "it's not just inference time fixed. It's" + }, + { + "start": 5101.28, + "text": "a pleasure to train. That's right, yeah." + }, + { + "start": 5103.04, + "text": "You you you train with a certain number" + }, + { + "start": 5104.72, + "text": "of keys." + }, + { + "start": 5108.36, + "text": "Okay. The last thing I'll talk about is" + }, + { + "start": 5111.0, + "text": "sliding window attention, which is a" + }, + { + "start": 5112.56, + "text": "really old idea. Like, you know, GPT-3" + }, + { + "start": 5115.56, + "text": "used actually this you know, if you read" + }, + { + "start": 5117.72, + "text": "the paper, they'll say we alternate" + }, + { + "start": 5119.28, + "text": "between full attention, which where" + }, + { + "start": 5121.64, + "text": "every position can attend to everyone in" + }, + { + "start": 5123.04, + "text": "the past, and a banded matrix style" + }, + { + "start": 5125.88, + "text": "attention where you can attend to" + }, + { + "start": 5127.36, + "text": "everyone within a fixed window." + }, + { + "start": 5130.28, + "text": "And you know, OpenAI has some early work" + }, + { + "start": 5132.28, + "text": "on these kinds of like different kinds" + }, + { + "start": 5134.08, + "text": "of attention patterns that you can use." + }, + { + "start": 5136.68, + "text": "But actually, this has become really" + }, + { + "start": 5138.28, + "text": "really popular over the past year." 
+ }, + { + "start": 5141.56, + "text": "This idea of alternating, you know, the" + }, + { + "start": 5144.4, + "text": "big full attention and a more local" + }, + { + "start": 5147.16, + "text": "attention actually hits a sweet spot for" + }, + { + "start": 5150.04, + "text": "how to manage like long context" + }, + { + "start": 5151.92, + "text": "performance while not paying too much" + }, + { + "start": 5154.48, + "text": "for inference." + }, + { + "start": 5156.32, + "text": "Um I think, you know, the more recent" + }, + { + "start": 5159.6, + "text": "revival in open models, I would maybe" + }, + { + "start": 5161.56, + "text": "say Cohere Command A was the first one I" + }, + { + "start": 5164.12, + "text": "saw do it, where they, you know, had" + }, + { + "start": 5167.0, + "text": "this like structure where every four" + }, + { + "start": 5169.36, + "text": "layers they would have a full attention" + }, + { + "start": 5170.8, + "text": "that attended to everything. The three" + }, + { + "start": 5172.84, + "text": "layers in between would use a sliding" + }, + { + "start": 5174.44, + "text": "window attention that would only be able" + }, + { + "start": 5176.36, + "text": "to look at local structure. And of" + }, + { + "start": 5177.72, + "text": "course, you know, as you go up sorry, in" + }, + { + "start": 5180.48, + "text": "this case down cuz they ordered the the" + }, + { + "start": 5182.72, + "text": "diagram the other way. As you go down" + }, + { + "start": 5184.56, + "text": "sort of these blocks, you know, you're" + }, + { + "start": 5186.4, + "text": "aggregating local information into" + }, + { + "start": 5187.8, + "text": "global ones. 
The local attentions at the" + }, + { + "start": 5189.68, + "text": "end can of course access more global" + }, + { + "start": 5191.44, + "text": "information, but this, you know, allows" + }, + { + "start": 5193.36, + "text": "you to manage the" + }, + { + "start": 5195.76, + "text": "the the cost of having a really long" + }, + { + "start": 5198.56, + "text": "context without having to go for" + }, + { + "start": 5200.56, + "text": "something like a state-space model or" + }, + { + "start": 5202.88, + "text": "more exotic intervention. And that's" + }, + { + "start": 5204.4, + "text": "worked quite well." + }, + { + "start": 5206.0, + "text": "Um there's also some innovation where" + }, + { + "start": 5209.08, + "text": "people change the embedding format for" + }, + { + "start": 5211.68, + "text": "the long range information where they" + }, + { + "start": 5213.12, + "text": "get rid of things like rope, so you have" + }, + { + "start": 5215.36, + "text": "no position embeddings at all. So," + }, + { + "start": 5217.12, + "text": "you're really looking almost at bags" + }, + { + "start": 5219.56, + "text": "where the short range information still" + }, + { + "start": 5221.24, + "text": "gets position information. So, people do" + }, + { + "start": 5223.52, + "text": "all sorts of you know," + }, + { + "start": 5225.56, + "text": "kinds of interventions involving these" + }, + { + "start": 5227.6, + "text": "these" + }, + { + "start": 5228.4, + "text": "both the embeddings and alternating" + }, + { + "start": 5230.56, + "text": "local and global structure." 
+ }, + { + "start": 5232.84, + "text": "Um I'll say that this is a" + }, + { + "start": 5234.8, + "text": "you know, attention and in general how" + }, + { + "start": 5236.36, + "text": "to manage the trade-off between long" + }, + { + "start": 5238.6, + "text": "context and sort of long context cost" + }, + { + "start": 5241.24, + "text": "and performance is a still an active" + }, + { + "start": 5243.4, + "text": "area of investigation. It's a place" + }, + { + "start": 5244.88, + "text": "where the most architecture work and" + }, + { + "start": 5247.2, + "text": "changes are still being done. Um we see" + }, + { + "start": 5250.24, + "text": "essentially a bunch of other models" + }, + { + "start": 5251.8, + "text": "adopt this idea. Llama 4, most recently" + }, + { + "start": 5254.24, + "text": "Gemma 4, Omo 3, they all do this" + }, + { + "start": 5257.12, + "text": "combination of sliding window attention" + }, + { + "start": 5258.84, + "text": "and full attention, in their case using" + }, + { + "start": 5260.92, + "text": "full rope instead of nope as the" + }, + { + "start": 5264.04, + "text": "embedding." + }, + { + "start": 5265.4, + "text": "So, as I said, this is becoming really" + }, + { + "start": 5267.32, + "text": "really popular." + }, + { + "start": 5268.92, + "text": "Qwen 3.5, which I put on the right," + }, + { + "start": 5271.16, + "text": "they're actually a little bit different" + }, + { + "start": 5272.44, + "text": "because they alternate a state-space" + }, + { + "start": 5274.44, + "text": "model" + }, + { + "start": 5275.68, + "text": "called a gated DeltaNet and a full" + }, + { + "start": 5277.92, + "text": "attention every sort of, you know, one" + }, + { + "start": 5281.52, + "text": "one full attention every four layer" + }, + { + "start": 5282.88, + "text": "every four layers." 
+ }, + { + "start": 5284.6, + "text": "So, it's the same alternating structure," + }, + { + "start": 5286.44, + "text": "but they're using a different sort of" + }, + { + "start": 5288.0, + "text": "cheap layer. In their case, they're" + }, + { + "start": 5289.28, + "text": "using a state-space model. I'll explain" + }, + { + "start": 5290.96, + "text": "what that is next lecture instead of a" + }, + { + "start": 5293.32, + "text": "sliding window sort of local attention." + }, + { + "start": 5295.76, + "text": "But you see this is like I think a new" + }, + { + "start": 5297.44, + "text": "theme over the past year where, you" + }, + { + "start": 5299.64, + "text": "know, open models are really trying to" + }, + { + "start": 5301.12, + "text": "grapple with long context performance," + }, + { + "start": 5303.24, + "text": "and the way to do that, at least so far," + }, + { + "start": 5305.64, + "text": "is to have these hybrid models that" + }, + { + "start": 5307.64, + "text": "aren't just global attention, aren't" + }, + { + "start": 5309.36, + "text": "just cheap attention. They're some sort" + }, + { + "start": 5310.719, + "text": "of mix in between. And that's that seems" + }, + { + "start": 5312.84, + "text": "to have worked very well so far in a lot" + }, + { + "start": 5315.0, + "text": "of these models." + }, + { + "start": 5317.76, + "text": "Okay, cool. So, as I was sort of trying" + }, + { + "start": 5320.28, + "text": "to emphasize, when you look across all" + }, + { + "start": 5322.56, + "text": "of these models, you start to see a lot" + }, + { + "start": 5324.16, + "text": "of patterns and hopefully a sense of" + }, + { + "start": 5326.08, + "text": "general understanding about what things" + }, + { + "start": 5328.28, + "text": "you can do and what things are good to" + }, + { + "start": 5329.84, + "text": "folks. 
Um we also see a lot of" + }, + { + "start": 5332.2, + "text": "differences in how we handle context and" + }, + { + "start": 5334.32, + "text": "how we handle position embeddings." + }, + { + "start": 5336.56, + "text": "Even tokenization, there's some" + }, + { + "start": 5337.719, + "text": "differences, right? So, there are" + }, + { + "start": 5339.32, + "text": "differences across these models, but" + }, + { + "start": 5340.63, + "text": "across these models, but" + }, + { + "start": 5340.64, + "text": "there's also commonalities that" + }, + { + "start": 5341.88, + "text": "hopefully now give you some intuition as" + }, + { + "start": 5343.88, + "text": "you go out and do your assignments and" + }, + { + "start": 5345.36, + "text": "and sort of mess with the leaderboard" + }, + { + "start": 5347.08, + "text": "and so on." + }, + { + "start": 5348.15, + "text": "on." + }, + { + "start": 5348.16, + "text": "Thanks." + } + ] +} \ No newline at end of file diff --git a/conductor/tracks/video_analysis_cs336_architectures_20260621/artifacts/transcript_clean.txt b/conductor/tracks/video_analysis_cs336_architectures_20260621/artifacts/transcript_clean.txt new file mode 100644 index 00000000..21fcad6f --- /dev/null +++ b/conductor/tracks/video_analysis_cs336_architectures_20260621/artifacts/transcript_clean.txt @@ -0,0 +1,2626 @@ +So today we're going to talk about +architecture, which at least to me has +always been pretty inscrutable. +Um and so I'm going to take the approach +of just telling you kind of everything, +right? I'm going to go through all of +the modern papers. +Um and we're going to just look through +what has everyone done? Um and so I've +titled this everything you didn't want +to know about architectures and +hyperparameters because I think we all +wished we lived in a world where the +only things you had to know were like VC +dimension or something, right? Like very +simple, you know, theoretical tools, but +that's not really where we are. 
+So okay. What we are going to do is we +are going to try to understand +architecture from kind of like a survey +lens, right? The best thing to do, you +know, better than listening to this +lecture even is for you to go out and +like train your own models and try +different architectures, right? That's +by far the best thing to do. That's part +of the philosophy of the course. But +we're not going to be able to cover the +whole design space of all the different +architectures that are out there, right? +Like that's not something that we have +the compute or the time to do. +So my opinion is the second best thing +that we could do is to try to learn from +the experience of others, right? What +has What has everyone else done? What +are the choices that they are making, +right? And by looking at kind of a +broader, somewhat zoomed out picture, +maybe we can start to understand, oh, +these are the kinds of parameters and +choices that are sort of fixed across +all effective architectures and these +other ones can be sort of varied without +impacting +how the model performs, right? So I'm +going to talk about, you know, basically +transformer variants. Like what is the, +you know, modern transformer starting +with, you know, the Vaswani paper and +then, you know, as we go to more modern, +more recent architectures, what do they +have in common? And then what are we +allowed to vary? Or not allowed, but +what do people vary as they go through +this, right? +So +I think many of you have taken an NLP +course of some kind or at least seen a +transformer, so you've probably seen, +you know, the very vanilla transformer +from Vaswani et al. Um you know, there +there are some fairly standard choices +that you make. You say, oh, transformers +don't have positional dependence, so +we're going to add a position embedding. +And what do we do? We're going to add +some sines and cosines. +Um we're going to have information +processing through a ReLU. 
Um and then +we're going to have a a post norm. I'll +talk about what exactly that is later. +Um and when you look at your assignment, +your A1, you're going to notice some +differences between the standard or the +vanilla transformer and what we've asked +you to implement. Well, we're going to +ask you to move the layer norm to the +front of each transformer block or the +non-residual layers. We're going to ask +you to implement something called rope. +Um +and we're going to ask you to implement +something called SwiGLU and not ReLU. +Right? Why do we pick these? Um one +reason is we've, you know, copied a lot +of this over from LLaMA, but so did +everyone else. Really, I think if you +were to train on your own language +model, I think you'll quickly run into +this question of, oh, there's so many +choices, right? Like what do I choose +for all these things? +And so let's now sort of walk through +all these different models. +The way I kind of think about +architectures is to think about to look +at all the different things people have +done and say, what are the things that +people have done? Can we pick and choose +from those? +Um Percy always makes fun of me for this +a little bit, but you know, I try to +look at all the the different models +that come out each year to try to make +this lecture. +Um and last year I thought, oh, there's +just a couple papers. It's going to be +fine. It's going to be fine. And then I +look through all the things and there's +a lot of papers. There's Qwen 2 and +Gemma 3 +and InternLM2. +and then there were even more. There's +like Nemotron 4 and Qwen 2 and oh oh my +goodness, there were 19 new dense +models. And so last year I had my work +cut out for me. +And then this year, you know, I thought, +well, there can't be that many new LM +releases. Like it's got to be slowing +down, right? Like people can't keep +training 20 dense LMs per year. Um and +that's technically right. There aren't +that as many dense LMs. 
Initially, you +know, I was like, oh, there's Qwen 3, +Gemma 4 just came out last Thursday, so +I put that in there. And Olmo 3. You +know, there's only a couple. And of +course I have to give a shout-out to +Percy's own 8B model trained with +Marin. And I was like, oh, we'll just +have a few things to cover. Um but it +turns out if you start looking, there's +a lot of different models. Um and so the +fact that we have so many different +models, most of these actually are MoEs, +mixtures of experts, and I'll be talking +about that tomorrow rather than today. +Um because we have such a big diversity +of models, we actually get a pretty good +picture of all the different choices +that we can make. Um +so I I made this like little table. +We'll come back to this little table at +the end of the lecture. +Um but basically at this point, you +know, starting with, you know, the +original transformer, there's been +actually quite a few autoregressive +language models kind of trained on the +same class of things. +Um and you can ask questions like, what +are the different vocabulary sizes? Or +what kind of layer norms do we use? Or, +you know, what kind of position +embeddings do people use? And we see +some fairly clear trends. I'll be +talking about this as we go. +Okay. +Um so +the goal here is that we're going to +cover couple different things. We're +going to cover common architecture +variations. So these are different +building blocks of the transformer. +Um and after we've established what the +standard building blocks are, like, you +know, what do we use for the the +nonlinearities or what do we use for +position embeddings, then we're going to +talk about hyperparameters. We're going +to go down even lower detail and say +like, you know, what is FF dim? Um +should we make that a multiple of four +or like multiply the the hidden by four +to get FF dim? How many vocab elements +should I have? 
Um +and then after that, we're going to talk +about very low-level tricks of how to +get models to train stably. And the +reason why I'm going to talk about that +in this lecture is because these +stability tricks have a pretty close +connection with the architecture +variation, right? Um one of the things +that, you know, higher level I want to +sort of impress upon you is that +architectures are actually a a very +complex set of tradeoffs, right? Like +what does a architecture have to do? +Well, it has to learn from data, so it +has to generalize. It has to train +efficiently on GPUs. And it has to not +blow up, right? Like halfway through +training, if your, you know, training +losses just go like down like this and +then suddenly blow up, that's no good at +all, right? So all these different +requirements end up getting baked +straight into the architecture. And +that's why these things are a little bit +messy and a little bit complex. +Um but you should keep that in mind and +that's why, you know, things aren't in +many ways not so elegant. +So we're going to start with the core +architecture piece. And as a high-level +view, you know, I think the the way that +I see a lot of the architecture stuff, +you know, looking basically +historically, is kind of in the early +days of, you know, starting with the +transformer until, you know, GPT-3 or +so, there's a lot of experimentation +that happens. People try lots of +different things. There's no like gold +standard that everyone has unified on. +And then, you know, LLaMA 2 comes out +and everyone's like, wow, LLaMA 2 is +great. I want my own LLaMA 2. And so +everyone starts training LLaMA 2-alikes +with, you know, minor variation +that people have. And then finally, you +know, last year we saw really big +differences or or sort of a trend +towards architecture modifications that +make training more stable. And this year +we see lots of trends towards +architecture variations that enable +longer context dependence. 
So there are +these big themes that are happening, but +really I think, you know, you see this +like big point when LLaMA 2 comes out +and everyone's like, wow, I want to +train something with that. And then +suddenly, or not suddenly, but after +that, people are starting to explore +once again. So it's kind of cool to see +all these different changes. +Um I think people can disagree about a +lot of things on architectures, but +there is one thing that everyone agrees +on, you know, like if you take the +transformer paper, I think a lot of +people will say like the transformer +people got like most of the things +got like most of the things +right, except this. And the thing that +they really did not get right or like I +think most people agree they did not get +right is where you put the layer norm, +right? So in the original um +? So in the original um +uh transformer paper, you know, the +layer norm goes in what you would call +the residual path, right? So in the +transformer, you know, you have the +residual stream, this X that kind of +runs through the whole network. And +sort of a delta back into this residual +stream. +And then in order to make sure that +these gradients are sort of stable +across layers, you know, a layer norm is +placed at the end of each of these +components. +Now, um +instead of putting the layer norms in +the residual stream, there's an +alternative. Um I'll refer to this as +pre-norm, +um in which you can put the layer norm +outside of the residual stream, but +before each of the computations. So you +can put it before the multi-head +attention, you can put it before the +FFN, right? Um we'll call this pre-norm. +the nomenclature will get a little bit +confusing. Um You can call this uh +post-norm for now, but let's call this +sort of residual norm, right? Cuz you're +putting the norm in the residual layer. +Um basically all modern language models +uh push the layer norm outside of the +residual stream. 
This is just like a +thing that basically everybody does. +Um there is one funny exception, but it +is OPT-350M. +And if you all are familiar with sort of +language models, we kind of know OPT in +general was, you know, kind of a mess of +a language model, right? And OPT-350M +um is even more so because I don't know +why only that model uh has a post layer +norm in the residual stream. +Okay. So this is one of the things that +like everyone agrees on. And so you +might wonder like why is this like such +a, you know, +uh like a uni- unified thing across all +the different models? +Um and if you look at some of the early +works studying like where do you place +the layer norm style research, um what +you really see is that, you know, the +early motivation for a lot of this was +when you train a transformer, you need +to do a warm-up. Actually, you know, +modern transformer training still does +warm-ups as well. But you definitely +need to do warm-up when you train. +Now, wouldn't it be nice if we could +remove the warm-up, right? So, that was +kind of the initial motivation for a lot +of this research. +But people quickly realized that +removing the warm-up had very serious +issues in terms of the stability and +convergence +of these things, right? So, if you did +post norm plus layer norm, which is, you +know, basically the original transformer +thing, you got this purple dash line. +Oh, you just don't converge as well +compared to doing something like pre +norm. You can ignore the other terms. +You would get much nicer convergence +even without warm-up, right? So, this +was the original motivation. +But really what people kind of quickly +realized is that, you know, moving the +layer norms outside the residual stream +has some pretty important implications +as you make your network deeper and as +you start to grapple with stability +issues. +To me, I think the gradient attenuation +issues are kind of the most clear. 
+When you sort of talk to people who do +architecture design, I'm not really one +of the the people that deeply engages in +this. But one of the things that people +often say is keep your residual stream +clean, right? So, in this case, you have +your X's coming in from the bottom on +the pre norm side, and this X propagates +all the way up to the top, right? All +the way up to your your final output. +And that allows gradients to propagate +if you in the backward pass straight +through this, right? +Um that makes gradient propagation very +simple, +which improves both stability and signal +propagation. And that's sort of what +people realized very, very quickly, that +if you do something like pre norm in +blue +initialization, sort of the gradient +size is kind of remains the same, right? +Because you have this nice straight +through propagation in the backward +pass. On the other hand, if you have +post layer norm, you have these kind of +complicated effects that happen because +you're layer norming each time you're +going through a transformer block. And +that's going to change the norm of your +gradients as you go backwards through. +So, you can kind of see, +you know, from the principle of keep +your residual stream clean, that pre +norm makes a lot of sense. +People also realized through +experimentation that this also improves +stability in general, that the sizes and +frequencies of gradient spikes +were improved under pre norm compared to +post norm. And, you know, this is a +figure from Salazar and Nguyen, who were one +of the first ones, I think, to to study +this phenomena carefully. +I think this is the reason why it stuck +around, right? Stability and the ability +to go deep are both very, very important +for modern large language models. And +so, this idea of moving your layer norm +outside of the residual stream is one +that basically everyone has adopted. 
+Um so now, you know, if putting layer +norms in residual streams is bad, +why does layer norm have to be at the +start? Of course, we have pre norm, +which is, you know, before our +computation, but we could have it after +computation as well, right? That's +as well, right? That's +equally good at least under that +knowledge logic. +Um and that's exactly right. Many recent +models like Grok or Gemma 2 or Olmo 2 +have the structure where they moved the +layer norm after the computation. So, +it's a post norm of a kind, but it's +outside the residual stream. +Other models still actually just put +layer norms everywhere. They put a layer +norm here, they put a layer norm after. +I'll get to this later as we talk about +stability, but one of the other lessons +that it seems to have held up very well +is if you have stability issues, you can +kind of sprinkle in layer norms +everywhere, and that will generally +improve stability. +It's almost very strange to be saying +this cuz it's so ridiculous. And yet, +that statement has actually been proven +right. Every time, you know, people have +encountered stability issues, they say, +"Oh, but what if we just throw a layer +norm into attention?" Turns out that +works, too. We'll get to that later. So, +okay, that's post norm +or double norm in this case where you +have two layer norms here. +Okay. The other thing that you can do is +in the original transformer, you have +the layer norm, which is this operation +right here. So, you have your +activations X. +You're going to mean subtract, divide +the variance, and then scale it back up, +right? And this works just fine, right? +It's not like this is wrong. And many +models have successfully trained on this +scheme. +But basically, most or all modern +models, I think, +use RMS norm, which doesn't subtract the +mean or add a bias term, right? So, it's +just a scaling down and scaling back up, +right? So, you can see this in the +equation here. 
+Um and really, layer norm is more +expressive than RMS norm. So, there's +really representationally no reason why +you have to use RMS norm. +But RMS norm is nice because in +practice, +there's really no expressiveness loss. +RMS norm models just as well as layer +norm. But more importantly, it is, you +know, faster, right? This is the part +where kind of the systems and sort of +architecture co-design starts to come +in. +Percy mentioned, you know, in the +previous lecture, this idea of +arithmetic intensity, right? We want to +keep our GPUs hot by doing, you know, +matrix multiplies and other very intense +computations. We do not want to be +wasting our GPUs by having them move +little tiny bits of memory back and +forth, right? That's a very inefficient +use of our, you know, very powerful GPU. +And so, what we want is to remove +operations that are small and involve +memory movement, but don't give us much +expressive power, right? So, by that +view, what we really want to be doing +here is, you know, if the mean +subtraction and addition isn't really +doing much for us, just get rid of it, +right? Um +you might think, "Okay, why does this +matter? We're just optimizing this +teeny, tiny operation that accounts for, +you know, in this case, something like +0.17% +of the total floating point operations +of our system." +But, you know, as Percy mentioned, it's +not really about the flops, right? The +flops are the the floating point +operations we do, that's sort of +multiplying matrices, but that's not +runtime, right? Runtime is a much more +complicated object. And, you know, +statistical normalizations, things like +layer norms, even though they're only +0.17% of the flops, depending on your +workload and depending on the setup, can +be up to 25% of the runtime, right? +That's kind of crazy. 
On tiny models, +this can be really, really big because +you're still having to move all these +parameters back and forth from fast to +slow memory and vice versa +when you're doing these operations. So, +data movement is really, really +important, and RMS norm can still matter +a lot because of this, right? So, +um +you can see kind of the difference here. +The arithmetic intensity is in white, +and then you can kind of see the flops +involved in black. And you see that +layer norm has a very low arithmetic +intensity, which is the operation we try +to want to remove as much as possible. +Yeah, question over there. +Data movement for normalization is so +disproportionate compared to arithmetic +contraction +So, for a ten- Something like tensor +contraction, which is, in this case, +matrix multiplies, +the majority of the workload is, you +know, multiplying. Whereas for stat +normalization, the majority of the +workload is memory movement. And memory +movement is quite slow. So, imagine the +case where like moving something is like +almost all of the compute, then you're +still paying quite a bit here, right? +Cuz activations can be quite large. +Yeah, I think the percent runtime in +this case is quite extreme. This is like +in the, you know, tiny models with like +matrices that don't really generally +make sense in modern workloads, but this +is, you know, giving you a sense of why +this is a free optimization. +Um and you do see this, right? This is +another paper in which people were +evaluating different architecture +interventions. Narang et al. in +2020. +I think this was a Google paper, and +they show, you know, for teeny, tiny +transformer of a 200 million parameters, +you got more steps per second. That's +the third column over here when you +switch to RMS norm. And in fact, you +actually get better performance, which I +don't think is something that you're +guaranteed, but it's a nice bonus +regardless, right? 
So, you got a free +systems win by just moving to RMS norm. +And so, basically, everyone has has +decided to move over to this now. +And in general, +there's a more general version of this. +There's no guarantee to any of the +things I'm saying, but bias terms in +transformers and neural networks +are generally not that useful. So, in +the original transformer, the linear +terms all have biases, +but most implementations actually just +drop the biases entirely, right? Once +again, this is another example of +something that's not very arithmetically +intense, but fairly memory +intensive, relatively speaking. And so, +ive, relatively speaking. And so, +you might as well just drop these, +right? And get the free systems win. +There's also some cases, I'll just +mention this offhand, where the bias +terms can also induce stability issues. +So, they're useful in other ways, but +really, I think the primary reason these +are dropped is just to simplify things +from the systems perspective. +Cool. +Okay. So, I think layer norms, the story +is pretty easy. +Um it's easy in the sense that what +people do is fairly standardized. Our +our understanding, not at like a deep +theoretical level, but our understanding +of like what layer norm does is fairly +good, right? Everyone moves the layer +norm outside the residual stream, +often pre norm, but I think this might +partially be be because Llama 2 did +that. Um and we roughly have a sense of +how to use layer norm to control things +like +gradient spikes +and keep signal propagation nice. +Related to that, we also now, you know, +basically always use RMS norm, and you +hopefully understand the general +principles here +of basically just dropping bias terms. +And that allows us to to keep our system +more arithmetically intense +while keeping the expressive power the +same. +Um I think the unsatisfying thing about +a lot of architectures is that, you +know, you can't really reason about this +beforehand, right? 
Like we don't know +beforehand that dropping the bias terms +that dropping the bias terms +is okay, but from a lot of +experimentation and now collectively +acquired knowledge, we roughly know that +dropping the bias terms on both the +linear and RMS norm is okay for typical +language modeling workloads, right? Um +this is the kind of statement that we +can make on the basis of of what we do +when we look at a variety of different +models. +Okay, any questions uh for layer norm +stuff? +Good. Okay. So, now I'm going to talk +about activations. Um +and there's a whole zoo of activations. +There's just a lot, right? Like ReLU, +GELU, Swish ELU, GeGLU, +SeLU, SwiGLU, LiGLU. +Um and what are these things? +Um I think at what point one point of +my, you know, more stats ML training, I +thought to myself I will never learn +these things. I will make it a point of +pride to never know what a SwiGLU is. Um +but now it's actually very important for +us to to actually like have a general +sense of what these objects are um and +which parts of these names actually kind +of matter for performance, right? +Um So, you can build and train a +language model on just a fairly vanilla +activation. Um +even, you know, I guess Chinchilla is +probably the best model out of that +group, but even if you just want a ReLU, +you know, you can train a reasonably +performant language model using just +that activation. There's nothing wrong +with that, right? And if we move to +GELU, which is a Gaussian error unit, +and really the only difference is this +tiny divot at the bottom here, which +really, you know, for the most of the +activation doesn't change anything, but +changes the gradients right near zero, +um then you can train models like GPT-3, +right? That's a perfectly good large +language model, not, you know, modern by +modern standards, but perfectly fine. 
+Um but then, you know, we get to the +gated linear units like SwiGLU and +GeGLU, and these are really where most +of the action is. You know, this is very +similar to layer norm in that I think +almost all credible modern language +models use a gated linear unit of some +kind. +Okay, so what is a gated linear unit? +So, these are gated activations. So, if +we want to look at something like a feed +forward layer, um we can just look at +this first part. This is, you know, a +very standard ReLU feed forward, right? +I have my X, I hit it with a W1, you +know, I I entry-wise threshold at zero, +and then I hit it with another W2, I get +my output, right? Very straightforward +ReLU network. +Another thing, um I don't say this as my +personal experience, but another thing +that is often said in architecture +design is that gating is often very +helpful. So, if you apply that very +general heuristic, what you might get is +to say, "Okay, well, instead of just +having, you know, this entry-wise ReLU, +why don't we also have a gate? And the +second gate, the second term here, this +is just going to multiply the output of +my ReLU entry-wise, and I have a second +matrix V, okay?" +Now, this is just going to modulate the +output of my ReLU. +Um and then I'm going to do everything +else the same, right? So, instead of, +you know, just having XW1W2, I have XW1, +and I'm going to gate that with XV. This +is another +uh activation the same size as this, and +then I'm going to, you know, down +project it back with W2. +Okay, so what is this? Now, this is a +RegLU. This is a you you make these +names by adding the first activation, in +this case ReLU, and GLU, right? So, the +ReLU gated linear unit. +Um and gating has been a, you know, very +effective other primitive in +architecture design, and it turns out +that this is very effective in language +modeling as well. +So, +um if you take something like a GELU, +we've already talked about that, right? 
+That's like the ReLU, but with a little +divot at the bottom here, um you will +get a GeGLU. Um and if you take a +Swish, which is X times a sigmoid, then +you will get a SwiGLU. So, this is a +Swish times, you know, the rest of it. +Um and this really covers a lot of the +modern models, right? Um generally the +Google folks have used GeGLU, so like +the Gemma models, the T5 models are +those. Um and everything that's kind of +like a LLaMA descendant uses a SwiGLU. +Um so, PaLM and the LLaMA descendants +are all kind of SwiGLU models. Um +I would say that SwiGLU is probably the +more dominant one, but honestly amongst +the gated units, doesn't really matter. +Now, here's a side note that will uh be +a semi-important piece of trivia later. +Um if you look up here, right? Um you +will notice that there are more +parameters for the gated uh model, +right? Cuz I have this parameter of V. +And so, if you do a little bit of math, +right? I now have three matrices instead +of two matrices, +right? What you should do is you should +maybe use a smaller feed forward +dimension by a factor of 2/3 in order to +keep the total parameter count the same, +right? So, this is roughly the idea of, +"Well, I want to keep the same number of +total parameters as my original MLP, but +I now want to make it gated, so I'm +going to make the feed forward +dimension, which is the output dimension +of this W, a little bit smaller by 2/3, +right?" So, this is a general rule of +thumb that people have followed, but +it's not really an iron rule. +You know, the original Noam Shazeer +paper that, you know, proposed this, +had some, you know, very small deltas +originally, but they're consistent +deltas, and I think to his credit, um +I think a lot of his papers have these +like error bar assessments of like +training multiple replicates and +checking to see if they're better. Um +and if you look, the GLU variants are +almost always consistently better than +the non-GLU variants. 
And this is a +parameter matched comparison because um +Noam Shazeer is always doing this 2/3 +adjustment to make sure that all of the +models have the total same total number +um of parameters. +So, this is quite nice. It's in some +ways a free win. Um almost everyone uses +a GLU. There have been other sort of +more controlled systematic comparisons. +This is uh the same paper I was talking +about before, Noam et al. in 2020. +Um Google actually in the 2020s did +quite a few nice large-scale +architecture comparison papers, um +although with a T5 architecture and not +uh autoregressive uh language model. Um +and they, you know, basically +comprehensively compare things like +GLUs, and you see once again, um if we +look at the SwiGLU or the GeGLU or the +GLUs in general, they do significantly +better at loss or the other downstream +metrics, right? +Fairly compelling on paper uh on these +papers, also clear from now a lot of +model training runs that SwiGLU and GLU +are good, right? +So, there's a lot of variations in +gating, but really the important single +axis to know is that gating uh for these +nonlinearities is actually quite +important, gives you +uh nice boost without much of a +computational cost. Um you know, that's +not to say that gated linear units are +necessary. I mean, GPT-3 was that. Um I +think the NeMo Tron 340B model used a +squared ReLU, which is a kind of a crazy +choice, but that works, too. Um both of +these models are perfectly performant, +but it's actually quite rare to see +anything that's not trained on a gated +linear unit, right? So, evidence is +pointing towards consistent gains on +using these gating tricks. +So, those are I think the the more +consensus choices for things that we can +do in architecture. Um now, this one I +think is a really fun idea, but one that +I think now the test of time has shown +maybe is not quite as good or maybe not +as popular of an idea. +Um normally, we do our transformer +blocks serially, right? 
We compute our +attention, then we compute the MLP, +right? One after the other. +Um if you're very systems-minded, you +might say, "Well, this introduces a +bottleneck, right? I have to wait for +the computation of one to do the other. +If they were instead in parallel, I +could bring to bear some new and cool +systems optimizations, potentially, +right?" So, you might ask, "Could we +parallelize the transformer block?" +And um this was originally an idea that +was in GPT-J, which is the open-source +attempted replication of GPT-3. +Um and kind of very interestingly, I +think GPT-J has been surprisingly +influential in sort of propagating a lot +of ideas. I mean, PaLM as well. Google +um is actually surprisingly bold with +the architectures that they do. Um but +the description in uh PaLM, uh which you +can see in their report, is kind of the +following. Instead of nesting this, +which is the sequential format at the +top, you know, you're just going to add +together the output of the MLP and +attention layer, and just add both of +those back into the residual stream. +Um if you implement this right, you can +actually share a lot of the components. +Like, you can share the layer norms, you +can fuse the matrix multiplies. This +allows you to potentially get additional +systems optimizations. Um +And I think a lot of the people that +have been influenced by Google, so +Cohere, you know, was founded from one +of the former +uh transformer authors, they do a lot of +Google-inspired optimizations. They +followed kind of this architecture. Um +but not very many others. Um this has +been a approach that has really fallen +out of popularity over the past, I +think, 2 years. Um I think mainly +because optimization of the serial form +has gotten sufficiently good that the +systems gains from the second one just +isn't worth the small hits to uh +representation power that you end up +getting going from uh parallel to +serial. 
+Effectively, you can think about it as +you've lost half of your depth, right? +And that can be +uh a deleterious +uh thing to do to your model. +So, in terms of the architecture things, +actually, you know, the fact that this +is so short should kind of suggest to +you how much the original transformer +formulation has somewhat stood the test +of time, right? Cuz the only thing I'm +really talking about changing here +um is, you know, where the norms go, or +you know, whether we have bias terms, or +whether we gate the MLPs, but those are +actually pretty minor changes compared +to all the things that you can do. +Now, uh, those of you that are, you +know, sort of carefully paying attention +might say, but wait, you know, there's a +lot of transformer alternatives that +change the attention. Um, yes, you'll +have to wait until next lecture because +today I'm just only going to cover sort +of core attention based methods. Um, and +next lecture I'll throw in a little bit +of a state space model stuff, but as +long as you're in this like dense +attention land, actually the +architecture from the original +transformer paper is pretty close to +what we do. +So, you see, uh, quite a bit of this, +right? So, uh, just now going back to +this, blue here is RMS norm block as +layer norm. You see most of the modern +models are sort of RMS norm models. +Serial versus parallel layers, the blue +one's parallel, the rest is serial. You +see mostly serial layers. Um, pre-norm +versus post-norm. Some of these, uh, +ones that I marked as post-norm are +actually pre and post-norm. +Um, and then these ones on the right, +these are GLUs, uh, almost always with +the exception of things like, uh, +Falcon, which use a gated linear unit, +but almost all of these are really, uh, +gated linear units for modern models. +So, you can see the trends quite +visually, um, +from what I'm telling you. +Okay. 
So, really the thing that is very +different across implementations, and I +think a place where a lot of the +architecture stuff is still in flux, is +how you do kind of position dependence +and incorporate information from other +positions, right? So, the core attention +component in some sense. +Um, so there are lots of different ways +that you can encode position into a +transformer. +And just so you know, to remind you, +right? This is very, very important +because attention is positionally +independent, right? They're just inner +products, so you can just shuffle them +and attention would be the same if you +don't have a position embedding. +The original transformer had sine and +cosine embeddings, kind of like a +Fourier transform intuition that if you +have sines and cosines, then you can +kind of recover position from that no +matter what. +Um, a number of other sort of large +models that, you know, followed soon +after that used absolute embeddings, +where each position had its own +different embedding. +Um, and then, uh, several other sort of +Google models like to use relative +embeddings. So, in here you're not +adding, um, embeddings into the +embedding, uh, like word vector +embeddings, but instead +you're adding a +vector to the attention computation +itself, right? So, if you're three +positions off, sort of the attention +matrix gets a different offset added to +it. And, you know, models like T5 +and Chinchilla use kind of this scheme. +Um, +the thing that has really become pretty +dominant in terms of position embedding +is this class of embeddings called RoPE, +which some of you may be familiar with. +Um, most models past 2024 use this type +of embedding. And it's kind of +remarkable given that RoPE, you know, in +some ways came out of nowhere.
+Um, originally I think this was also a +GPT-J innovation, um, +from I think, uh, sort of not very well +known sort of blog post and, uh, paper +combination, uh, from an author in +China. Um, but really it has some really +interesting ideas for why you would +do something like RoPE. +So, RoPE, you know, is a relative +position embedding. And a relative +position embedding, +let's make an +opinionated stance that I should not +care about the absolute position of any +words. So, if, you know, A uh, an apple +appear together, even if it appears at +the start or at the end, right? In RoPE +embeddings, they should kind of get the +same, uh, sort of result. Um, and we do +know that, you know, or and we want to +sort of represent it in this way, right? +So, I have an embedding F, and I have +another embedding F, and these are going +to take in the identity of the words X +and Y and the positions absolute of I +and J. +And I want this to be equal if I take +the inner product of these embeddings to +be equal to a function that only depends +on the relative difference. +Right? Um, and every existing embedding +before it didn't really fulfill this +equality. Like sine is not relative +because it has these absolute cross +terms that are not relative. Absolute +position embeddings, just by the +name of it, is obviously not relative. +And then relative embeddings, +technically these are relative, but +they're not kind of embeddings because +they're just adding +to the attention +matrix, right? So, there's no inner +product structure that, you know, you +can extract out of them. +So, given this, you might ask, is there +a nice way that we can truly have this +relative embedding? +And the idea is very cool. Um, it's +really just looking at kind of uh, +properties about angles and cosines.
So, +we want our embeddings to be invariant +to absolute positions, and we know that +inner products of any kind are invariant +to arbitrary rotation, right? So, the +idea is to say, I'm going to take my +semantic word vectors, the ones that are +are independent of any position. So, +this is my starting point. And then I'm +going to rotate each of these vectors, +in this case in 2D, um, based on the +position that the words appear. So, you +know, just as a +uh, simple example, we, let's say we +have the uh, sentence, we know that, +right? We appear at position zero, so +I'm not going to touch that at all, +right? I'm just going to keep that where +it is. +The word know is at position one, so I'm +going to rotate it by some angle, right? +And that's my my one position rotation. +Now, what happens if I apply the same +idea to, uh, the following sequence, of +course we know, right? In this case, we +and know are still adjacent, they're +right next to each other, but their +absolute position is shifted, right? Of +course, you know, comes before we know +now. In this case, I'm going to rotate +the word we by two positions because +it's two index, right? 0 1 2. So, the +word we is in the second, uh, position +number two, so I rotate by two. I rotate +know by three positions cuz it's in +position number three, and what do you +know, the relative angle between these +two is still separated by one, right? +So, this is a very, very simple idea of +just using rotations, uh, to represent, +um, +position. And if we do that, then +anytime we take an inner product, those +inner products are going to be invariant +of absolute positions. +Now, you might say, well, in two +dimensions that's pretty easy cuz you've +only really got one choice, you got +clockwise and counterclockwise, but in +high dimensions, there's an infinite +space of ways that you can rotate +vectors. So, what do you do in D +dimensions? Um, well, you do the +simplest possible thing and it works. 
+The simplest possible thing is to reduce +it to the 2D case repeatedly. So, you +have a D-dimensional vector, just cut it +up into chunks of two, and each pair of +two dimensions gets rotated. And the +theta at which these things rotate vary, +right? Some of them are very low +frequency, so they rotate very slowly, +so they uh, they can capture long-range +dependence. Some of them rotate very +quickly, so they capture things like, +are they neighbors to each other, right? +Um, and then at the end, you know, after +I've rotated every pair of vectors, I +get sort of my final embeddings. So, +this is the, you know, rope approach. +the paper, if you read it, has a very +complex motivation about complex +numbers, but really I think the +intuitive way, at least to me, to think +about it, is to just you want to rotate +by reducing to the two-dimensional case, +and you're just rotating every pair of +coordinates. +Gemma 4 just came out on Thursday, and +they have like another different kind of +fun thing that they do, which they call, +um, I think proportional rope or P-rope, +um, which is a really strange way to +just say that the only thing they rotate +is the first two coordinates, but that's +another valid thing that you can do as +well. So, there's a lot of different +things that you can do in this space +that end up working. +Okay. In practice, what you're going to +end up doing is, you know, you can take +your vector and you can make a sparse +multiply with sines and cosines, and +this is going to be giving you some way +of rotating your input vectors X's, +right? So, X times, uh, W times R, this +is going to be your final embedding that +you get. +Um, and finally, you know, this is a +sine and cosine, which looks a little +like sine embeddings, but it's really +important that I'm multiplying with +these sines and cosines rather than +using them as embeddings cuz that means +that there are no cross terms. Um, and +this is purely relative, right? 
There's +no absolute position information that +you'll get out of inner products. +Um, if we really, really wanted to get +into low-level details and you ask like, +how do I actually implement this thing, +you know, you're going to have to do +that. Um, you have your usual attention +stuff, and then what you do is you +generate cosine and sine angles, um, +based on the position IDs of sort of +where your sequence is, and then you're +going to apply those cosines and sines +onto both your queries and keys for your +sort of attention computation, um, and +you can either apply them as a matrix +multiply or you can go through and apply +them manually, uh, just as a rotation, +right? Fairly straightforward, and you +would do this at the attention level +rather than at the very bottom to sort +of enforce position invariance every +time you're doing attention +computations. +Okay. So, that was rope. Um, it is a +little bit confusing, but once you +understand the geometry of just rotating +things, it's actually fairly, uh, +straightforward. +Okay. I'm going to pause here for one +moment, um, in case anyone has any +questions about the various like +architecture bits. Um, we're going to +then talk about even lower-level details +about hyper parameters. So, yes. +Do you know about any papers that do a +higher-dimensional rotation? +Higher-dimensional rotation never +worked? +It's a good question. +I don't think so. By a +higher-dimensional rotation, like any, +you know, 2D rotation in the space would +just be kind of a variant of this. You +could certainly do like any one manifold +that like is a closed loop. I have not +seen that. +Yes. What do you recommend for this? +What do you think is the best way to +distill this kind of knowledge problem. +People who are back to work boards. +It's a good question. 
Um +I don't know if there's a way beyond +some combination of like looking broadly +enough to get a to get to get a pattern, +which is what I the procedure I'm trying +to do in this lecture here. And then the +other one is to try it yourself even a +much smaller scale to form an intuition +and like a theory for how these things +come together. I think those two are +really the right ways. I think reading +any single paper in isolation is very +very difficult especially now because no +single paper seems to give any full +detail for a lot of language models +these days. +Oh, lots of questions now. Okay, good. +We'll go in Yeah. Um so I have a +question about the question on the +parallel layers and the the serial +layers. Yeah, I understand the modern +models are +thinking of the resource efficiency. So +they will use the parallel layers. They +have the +idea but there's there's a difference +there's a there's a big difference +between the accuracy um for these two +patterns, right? I want to know like +what's the What's the +What's the difference of of the +accuracy? Is it big enough to Is it +small enough to allow the current model +trainers to ignore that or is there any +problem? Yeah, I think you know, the +that's actually really mixed. So if you +read the original Palm paper, I think +they're like very confident about the +use of parallel layers like no +performance drop 15% systems utilization +improvement. So if you read just that +you'll kind of say like oh, it's just as +good. Um but I think a lot of the the +later Google models have stopped using +this, which you can take on as an +implicit signal that actually there +might be some losses. And once again, +this one is a little bit hard to um to +get precise numbers on because no one's +done the ablations as far as I know on +parallel versus serial um controlled +nice ablations at least. +Yeah. +So yeah, so +what's the difference between like Eagle +and RoPE? +Yeah, yeah. 
I mean this difference is +really just like which of the +coordinates you're rotating. +Like you don't rotate most of them +because a lot of the I mean the argument +originally I think is that the low +frequency parts just aren't rotating +very much. And so you can drop them if +you're really strapped for you know, +sort of extra space. And these this is +really a optimization for teeny tiny +models where like you don't have very +much like hidden dimensions to to have +activations for. +For the relative embeddings not having +an inner product, +um is that cuz it only applies to keys +specifically? I'm trying to understand +the logic. Yeah, so they applied both +the the keys and values, which is kind +of why you know, you get this like +relative effect from where you are. Um +you want to not have cross terms, right? +So so if you look at the sine and cosine +embeddings, then you'll not only get +sort of the you know, the original +vectors, you'll kind of get these weird +cross terms between the position +embeddings and the word embedding +themselves and so on and so forth. And +then you can kind of back out what the +absolute position is. So even sine and +cosine embeddings are not like pure +relative position embeddings. Um +you know, you have to accept the premise +that you know, the relative embedding is +what you want. But once you do kind of +you end up at the RoPE solution somewhat +naturally. +So what's the issue with So the issue +with this is that it just can't be +factorized as an inner product. That's +more of an aesthetic problem, right? +Like if if your constraints are I need +it to be uh relative and I need it to +factorize as f of xi and f of yj, then +this is not a solution in that class. Um +to be fair, there's a lot of um +embeddings that work this way that do +work like Alibi and other kinds of like +approaches like do do this kind of +inject into the attention matrix and +they do reasonably well. 
Um it's not +necessarily the one that's become the +dominant approach is what I can say. +Cool. Okay. +Great. +Now we'll talk about hyperparameters um +and I think hyperparameters are really +something that you start to engage with +once you like actually have to train a +model, right? When your knowledge about +language models are abstract, you don't +have to care about any of these. But +once you have to instantiate it, you +start to ask questions like well, how +big should the feed forward size be? +Um how many heads should I have? +Um what should my vocab size be, right? +and you might also have questions of +like what should my weight decay or +dropout be? Like do I even need to +regularize? I I have a lot of tokens, +right? So do I need regularization? +Um and do I need very deep models or +very wide models? Like what are the the +right kinds of things to do here, right? +Um and all of these if you start out +with no knowledge, it's actually very +daunting because you have to search this +like very big high dimensional space. +Um but the space of things that people +try is actually pretty small. And from +that maybe you can start to think about +you know, smarter search processes of +like where you want to vary things. +One of the things that's a really +consensus hyperparameter +um is this idea of the uh +ratio between the feed forward size, +which is kind of the output of your +first matrix multiply in an MLP, and the +model dimension, right? So this is +really the the uh ratio of the two +dimensions of your W1 and your as well +your W2 matrix. +Um this seems like a thing that's very +important and controls kind of the +richness of your MLPs. So what should it +be? Well, for whatever reason, it should +maybe be four times your hidden +dimension, right? Um and this is a rule +of thumb that works remarkably well and +I will show you some data on like why +maybe this is a fine number to choose. 
+There's a few exceptions um and funnily +enough, the really extreme exceptions +kind of backtrack on that. +Um exception number one is variants of +the gated linear unit. I already told +you about this. So if you were thinking +about it, this is probably cached in +your head, right? GLUs have more +parameters, right? If you keep the same +dimensions. So if you want to keep the +parameter size of your MLPs the same, +well, you need to scale down by 2/3, +right? So most GLU variants, this means +that you're going to end up with +something like 2.67-ish, +right? So everyone that's uh down here +2.67 to 2.5, this is roughly applying +this like 2/3 correction. +Um and then for whatever reason, um the +Llama 2 folks decided, well, we actually +have very efficient um attention heads +with like um +uh +uh MQ A, which I'll talk about later. Um +and because of that, we can multiply +this ratio by an arbitrary 1.33 and +we'll get roughly 3.5. And so the Llama +people kind of like arbitrarily chose a +slightly different ratio, which +essentially emphasizes the MLPs a little +bit more. Um but really if you actually +look through all of the the papers, +you'll find you know, either 2.6-ish or +3.5 for GLUs um or four if you're doing +uh non-GLU models. +Okay. There's another exception, which I +find to be very funny but also very very +cool, which is um +you know, throughout as you read these +like technical reports, you'll find that +most people are just very boring in +their choice of architectures. They're +like we did Llama but we changed one +thing. Um but you know, folks at Google +are very bold sometimes um and T5 is one +of my favorite ones because they have +some really bold settings. Uh they +decided that um instead of following +this like 4x rule of thumb, they decided +that they want to have a 64x +multiplier, which is like way bigger +than four. Um and they have a reasonable +argument for this as well. This is +another like systems-based argument, +right? 
They said, well, you know, if the +bigger my matrix multiplies, the more +efficient I can keep my hardware. So if +I make this, you know, multiplier really +big, then you know, my matrix multiplies +can potentially be sort of more +efficiently utilized, right? +Um and some others like Gemma 2 have +also tried to really push a little bit +higher on this. But really uh T5 is a +kind of astounding exception at 64. I +don't think any other model has really +gone that high in the feed forward +multiplier. +And empirically, if you look at other +sort of works that try to do more +controlled comparisons of this ratio, um +I've taken this one from Kaplan in 2020. +This is the classic uh neural scaling +laws paper um where they do sort of +various controlled uh studies on +language models. You'll see, you know, +this wasn't the point of the study, +right? It was a scaling laws study. But +you'll see in one of the panels that +they actually have a sort of ablation or +sweep where they change the feed forward +ratio and they look at the loss, right? +Um for a very small model here, right? +But what they find in this +paper is there's a basin where you start +at about one and you end up about maybe +10 where this hyperparameter is like +pretty good and very very flat. You lose +very little relative to the optimal loss +down here, right? Um and then if you get +it really wrong, like you get, you know, +above 10 to 100 or something like that, +you know, then your loss starts really +shooting up quadratically. +Um and so a lot of these choices that +range between like 2.6 to four, they're +all kind of falling into this relatively +nice basin. So you're fine choosing +those numbers, right? +Okay. So what can we learn about this +hyperparameter? Well, the default +choices have worked very well for nearly +all modern language models. So you can +safely choose that. Um T5 was a fine +model or the version 1 T5 was a fine +model, +right?
Like it wasn't a bad +model. Um and so +even radical choices +can technically work, but it's probably +going to be compute inefficient. And I +think the funniest part of the saga of +this kind of the punchline of the T5 +saga to me is that they have a follow-up +model T5 v1.1 um that's like +supposed to be the improved version of +T5 and they kind of go back to the +standard 2.5 multiplier, you know? So +there's nothing explicitly stated here, +but clearly, you know, when they tried +to update T5, they decided that they +wanted to go back to a more standard +multiplier, which I find to be a little +bit funny. +Okay. So, that's the, you know, +feed-forward ratio, +um which now you have like a rough sense +of like what the right order of +magnitude is. +Now, let's talk about a different +consensus hyperparameter. Um I always +found this to be very strange when sort +of teaching uh 224N and, you know, just +sort of teaching students about this, +which is, if you have a multi-head +attention, where you have multiple heads +for your attention in your transformer, +um the canonical thing to do, the thing +that almost everyone does, is if you +have multiple heads, you make sure that +the size of those heads, the head +dimension, is such that you sort of have +the same dimension as a single-head +transformer, right? So, you always make +sure that you sort of divide the hidden +dimension to basically multiply with H. +So, in this case, right, you have H, the +number of heads, and the dimension of +each head is D over H, so you multiply +the two and you get D, right? For some +reason, this is kind of the rule of +thumb. +Um of course, this doesn't have to be +true. We can arbitrarily change the +ratios between head dimensions and model +dimensions, but most models do follow +this guideline, and it turns out to work +pretty well. Um +You know, we can look at a variety of +different models, classic and new.
I, +you know, have the latest and greatest +quad as well, and you kind of find, +yeah, the ratios are roughly around one +a model head. Um you know, notable +exception of T5, um and even Lambda, +which is another Google model, um but +really everyone sticks around one. And I +think this is +kind of an interesting one. +Um I think the thing about head +dimensions uh that I'll that I'll end +with here is I think this is yet another +kind of forgiving hyperparameter. Um +there's a couple of ablations that +people have done. There's once again a +pretty wide basin around one that you +can sort of get away with. +Okay, but that one's maybe not the most +critical uh hyperparameter. +I think maybe one of the most critical +and interesting ones, I think +conceptually, is this idea of an aspect +ratio, right? Um and then sort of to add +an extra point here, +um when you scale models up or down, the +way you usually do that is you fix an +aspect ratio, like how wide your model +is versus how deep it is, and then you +make the whole model bigger, right? So, +the aspect ratio in some sense controls +the entire depth-to-width tradeoff as +you make models bigger, right? +Now, you might wonder how deep should my +model be. Like, if you've been following +all this stuff on like reasoning and so +on, you might think I need a really deep +model or really shallow model if I want +systems utilization. You might think +that there's a lot of sort of variation. +And there is a lot of variation, um much +more so than other hyperparameters, but +there's actually like a fairly clear +sweet spot that most modern models fall +into. +Um you don't really see models go like +too uh too deep, um and you also don't +see models go too wide uh in either +direction, right? You see most models +have a ratio about a hundred um D model +over N layers. Um so, about hundred sort +of width for every layer that you have. +I mean, this is true for like GPT-3 or +LLaMA or any one of these models. 
+And really, I think the considerations +are partly a tradeoff between +expressiveness and hardware, right? If +you have an extremely extremely deep +model, um they get very very annoying to +deal with systems-wise. The deeper your +model, like, what are the ways that you +have for parallelizing them? Well, you +might have to cut up your layers. If you +cut up your layers, +we'll talk about this in the systems +lecture. Once you start cutting up your +layers um depth-wise, you have very +serious issues in parallelization. +Pipeline parallel, which is what this is +called, is something that like most +people really really do not want to deal +with. Whereas width is much easier to +parallelize. If you have a really wide +model, you know, you can cut that up +very easily in your GPUs. Uh tensor +parallel, which is what it's called, is much +much simpler to deal with. +And so, in some sense, you know, there's +systems reasons to go wide, and maybe +there's expressiveness reasons to go +deep, and you end up at roughly a +hundred. Um and I think one of the +really interesting things about um +transformer hyperparameters is there are +a lot of hyperparameters that seem quite +important, but they're also fairly +forgiving, and people have converged +roughly on the minimum. This is yet +another plot from Kaplan et al., um +which shows another sweep over +hyperparameters for differently sized models. +Um and once again, you see, regardless +of kind of the size of your model, +roughly speaking, the optimum aspect +ratio is fairly similar, and they live +at about a hundred, maybe a little bit +less depending on how you want to do the +accounting, but really, you know, +anywhere near a hundred is a pretty safe +bet for aspect ratios. +Um Yi Tay and others uh did a number of +really interesting sort of like +architecture um +architecture variation experiments, in +which their general conclusion on this +was that let's look at the top panel +here.
You have a lot of different kinds +of +uh models that you can have in terms of +depth-to-width tradeoffs, um but as you +sort of sweep the depth-to-width +tradeoffs, you find that really, um the +only thing that matters in some sense is +FLOPs. As you increase the FLOPs, the +models get better, and that's really +controlling the majority of the effects, +not necessarily uh the aspect ratio. And +so, I think what has really emerged from +this is the sense that there's a general +forgiving band of hyperparameters that +people tend to choose, and then you +really worry about primarily your +systems utilization rather than sort of +expressiveness concerns, which are hard +to reason about. +Um okay. And then maybe the last +hyperparameter thing uh that I want to +mention is vocabulary sizes. Um +and this one's kind of interesting to me +because there's a really clear +difference between two classes of +models. Um I think in the early days of +a lot of, you know, um +uh early days of open-source model +training, um +there were a lot of monolingual models +whose only goal was to be good on +English. And for those models, you had +these like much smaller vocab size, in +the 30,000 range. Um and then, +post-LLaMA, a lot of people were really +interested in multilingual or like +production systems. So, these include +closed-source models like GPT-4. Um all +these have much much larger vocab sizes, +and these are roughly in the hundred to +200,000 +um vocab range. +And you see generally that, you know, +Google models have a ton more vocab. Um +LLaMA derivatives roughly range at about +a hundred uh thousand tokens, and then +the the sort of monolingual models are +about 30,000. +Um this is somewhat clear. The +multilingual models really do need much +larger vocab to cover the whole space. +Generally, the models on the right are +also bigger. 
There have been scaling law +studies showing that the bigger your +model, the larger the vocab it can +handle, and so this is also partially +driven by uh modern scaling trends, +where the models on the right are +generally bigger. No one's training +small uh sorry, large monolingual models +uh anymore. +Okay. So, um yeah. +Sorry, uh the question was like, if you +have Sorry, multilingual models or +sorry? Multimodal. Multimodal. Yeah, so +I guess it depends on the way that your +tokens are encoded, but, you know, if +you're tokenizing your images and things +like that, then you need to, you know, +have many more tokens to account for +those. Um if you look at like various +open-source releases, they'll have like +a different image tokenizer with its own +vocab, which is quite large. +Uh how valid is it to +compare +bits uh bits per byte for different for +different tokenizers? +How valid is it to compare bits Oh, that +is a great question. Okay, yeah. Uh +that's not a hyperparameter question, +but that is a good question. Um so, +what is the right way? Okay, so so, let +me let me like +step back a moment and like put us in +the right mindset. So, if we think about +language modeling, language modeling is +is a generative modeling task, right? We +are modeling the probability of a +sequence. +Now, as long as your sequence is fixed, +right? It's the same. You haven't, you know, +adulterated it in any way, and you provide a +probability over all strings, that's +always valid to compare, right? At that +level of things, it's always valid. +Now, when you ask the question, is it +valid to compare the bits per byte of uh +arbitrary token or or two arbitrary +tokenizers? +Really, there's two things at play. The +one thing is, you know, did you touch +the sequence at all? Like, if you look +at some tokenizers in the past, before +subword tokenizers, they would drop some +tokens or drop some words. That changes +that makes the comparisons invalid.
But +modern tokenizers are complete. They can +model any sequence, so that's not a +concern. Um the other thing that you +have to worry about is, are we like +length normalizing it in any way, right? +But for bits per byte, you're always +normalizing with the same number, which +is the number of bytes, and so this is +always a valid comparison, right? So, +that's kind of how to think about, you +know, tokenizer comparisons. So, for +example, +uh I think they had the results showing +that +comparing perplexity for fixed +tokenizers +is is is is um +always leads to to better actual +performance. +On on downstream network tasks. +Is the same thing they were looking for? +Um perplexity and BPB are kind of dual +to each other, so yes, if that's what +you're asking. +It's only yes and only no, cuz if you're +comparing +you could two frame compare the +perplexity as compared +but you're changing it different splits. +Changing it different [snorts] +Okay, we'll have to we'll have to talk +later cuz I'm not sure I understand the +question, but I think that that's an +interesting set of questions. Okay, +good. +All right. +So, um you know, we're we're going +through really the the the low-level +lowest levels of details of language +modeling, which I think has really +exposed a lot of interesting ideas while +we sort of talk through this. And I +think dropout and +regularization, I think, is another very +interesting class of ideas. Also one +that I think is very counterintuitive +from your machine learning 101 +intuition. +So, let's uh +go through what I think is like the the +standard argument for, you know, +regularization. Um well, if I'm doing +language modeling, I have a lot of data, +right? I have more data than I can +process most of the time, right? Unless +you're at, you know, Google, maybe even +then, there is more internet data than +there are FLOPs. So, you know, I'm +probably not even going to see the same +data twice, right?
Um so, I'm only going +to do a single pass on a corpus, and +there's very good reasons and arguments +to believe that a single pass of SGD or +other optimizers is never really going +to memorize my data very much, right? +So, this means overfitting is not really +a problem uh almost ever during compute +constrained language modeling. +Now, you know, some people even actually +only look at training loss because they +believe so strongly that overfitting +doesn't happen in single pass SGD. +Now, given this, you know, you can sort +of sit and think about this. Should I +use dropout or weight decay in language +model training, right? +Okay, you can think about it a bit. +you know, one unfortunate thing is that +a lot of recent models don't talk about +this stuff at all. Um it's really +lower-level details than like tech +reports are willing to expose. +Um but if you look, actually you find a +lot of models um do both. Especially +weight decay actually is a fairly +popular intervention even for modern +high-performance language models. Um +this is very, very surprising, you know? +I mean, some of the dropout things um +you know, maybe +uh have gone out of favor, but weight +decay actually remains fairly popular. +And this is very mystifying. Like, why +is this? +Um and this is, you know, one of the +reasons why I think deep learning is +hard and this architecture lecture is +very strange and hard. Um it's because +these things interact in very strange +ways. So, there have been papers that +have argued um and shown nice evidence +that weight decay is actually not a +regularizer sometimes. It actually +interacts with the optimizer to +essentially um +make optimization better. Um so, if you +look at the training versus validation +loss across different weight decay +settings on, you know, language model +training for single pass SGD, you don't +really see any difference. Like, weight +decay isn't shifting things so the +validation loss is better. 
There's +already no overfitting. We're on the x +equals y line here, right? So, doesn't +control overfitting, +but if we kind of look at different +levels of weight decay, and not only +just different levels of weight decay, +we look at weight decay combined with +learning rate decay, um what we find is +that the stronger weight decay runs, +these blue dash lines on the bottom, you +know, do significantly better because +they start out slow, but they +essentially end up um converging to a +much better minimum later. And this is, +you know, generally true when we decay +learning rate, not necessarily true when +we're in constant learning rate, which +is maybe somewhat more of where your +intuition is coming from. +So, you know, this is part of why it's +very difficult to reason sort of a +priori or like from scratch, you know, +the behavior of all these different +choices and why, you know, I think Percy +and I have designed this class so that +you interact with stuff because, you +know, you might come upon this thing +that where basically weight decay is +actually an optimization intervention +and not necessarily a regularization +intervention, which is, you know, what +you would expect here, right? So, always +keep that in mind that these kinds of +unexpected effects can really start to +kick in +uh for these kinds of uh settings. +All right. So, to put everything +together for hyperparameters, there's +actually for, you know, a lot of the the +maybe more hairy-looking +hyperparameters, actually just fairly +standard choices that have worked well +for everybody, right? You know, factor +of four rule of thumb, keep your head +dim and your number of heads uh equal to +the model dimension, um pick an aspect +ratio roughly around 100, um +and, you know, if you ask about +regularization, right? You want to maybe +try a couple things cuz regularization +actually does interact with optimizers +in ways that are quite counterintuitive, +right? 
So, this is the thing that some +people uh still do even though you you +don't need the regularization at all. +Actually, maybe I'll stop here in case +yeah. +Are there any significant differences +maybe for like um +the diffusion models? +Ooh, diffusions. +That I have not looked into enough, to +be honest. Um there aren't that many +people training big diffusions is one +issue. Um and many of the models that +have been trained are retrofitted cuz I +think the architectures are actually the +same as the, you know, like a LLaMA-like +model. Um but if you're asking the +question of like, what's the optimal +architecture if you were to train from +scratch, I don't know what that is +actually off the top of my head. +Yeah. Do you have any explanation for +why regularization works in some cases? +Well, I guess it's not that +regularization in general affects +optimization. I don't think people do +dropout anymore because, you know, it +doesn't really uh interact well with +optimization. But for example, weight +decay, you know, which is shrinkage to +zero, um that might allow you to use a +higher learning rate or it might allow +you to decay faster. There are lots of +ways in which all these terms are +interrelated. +Now, I've talked a lot about how to +design um expressive models by sort of +looking at all these other models that +have been trained. +Um one of the things that I'll I'll +highlight now is over the last few years +um a really big emphasis has not been on +performance alone. It has actually been +on stability. And this becomes an +increasingly important concern as your +models get more and more expensive to +train, right? Um we've kind of seen that +a lot of these choices are forgiving, +right? Everyone's kind of doing similar +stuff. And so, you know, you can mess +with these, but you're not going to get +a big performance difference. That's +fine.
But if your model, you know, +suddenly blows up some part into +training, like you get these like +horrible-looking spikes all over the +place, um you know, you might end up +with a model that is, you know, actually +not very good quality, right? Or it +might be unrecoverable. You might have +spent, you know, millions of dollars in +training, and, you know, you get to a +point where the model is no longer able +to be trained any further, right? That +would be a horrible thing to happen if +you have a lot of compute that you want +to spend. +So, you don't want to train models that +look kind of like this blue curve with +like spikes everywhere and these, you +know, big gradient norms happening. Um +so, what do we do to fix these stability +issues? I mean, this is really, I would +say, like a core core issue. +And, you know, if you have stability +issues in language models or in general +neural networks, there's a few, you +know, usual suspects that you've got to +start looking at. +Um one of them is the soft maxes, and +the soft max has two things that are +both really bad for stability. One of +them is an exponential, right? We can +see how that blows up very quickly. Um +you also divide two numbers, and that's +also a potentially very dangerous +operation, right? So, a soft max is one +place where you got to be extra, extra +careful. +And where are the soft maxes in a +language model? Well, there's two of +them. There's one on the output side +when we output our probability +distribution, and then in attention when +we normalize the attention, there's +going to be another soft max, right? So, +we can think of both of those as really +kind of danger zones for our model, um +especially the attention. +But okay. +Let's start with thinking about the +output soft max. The output soft max can +blow up on us. Um +and one of the things that we can do is +we can try to control +um sort of the the normalizer problem. 
+So, you know, let's sort of think about +the soft max calculation. We want to +compute a log probability to compute the +loss. Now, what is a log probability? +Well, it's, you know, the output of your +model U, and then you've got this log +normalizer, right? This U is +well-behaved because in some sense this +is the output of your model, right? This +is just the output of your residual +stream with all the things that are +added in. So, if U is well-behaved, then +log P, the first term, is well-behaved, +right? If the model is being okay. +Now, the second term, this log Z, this +might not be so okay, right? If Z is +really big or really small, even if the +output of your model is somewhat +well-behaved, it could blow up. And what +is Z? Well, it's an exponential, right? +So, it could potentially blow up very +quickly on you. +Or if this is zero, it could also blow +up on you, right? So, both of those +directions are very, very bad. +Now, we would ideally like our Z to be +somewhere near one, right? +Um or log Z to be somewhere near zero. +Um what can we do? Well, one of the +things that you notice, right? If you +sort of thought about the action of the +soft max, is this whole thing is +overparameterized, right? Um I could +sort of push things in and out. So, if I +add a constant to U, I can manipulate +the Zs without really affecting the +output of the soft max, right? You can +cancel out between the normalizer and +sort of the output of my model. +So, because of that property, one thing +that I could do is I could add a +regularizer. Um this is from from Jacob +Devlin's paper 2024, uh sorry, 2014, um +in which he adds sort of this squared +log Z term. Um +and what this is doing is it's just +penalizing how far away your log Z is +from zero. And if log Z is near zero, +that's nice because this whole +expression is kind of numerically +stable. +This is called the Z loss trick. Um it's +been used by a number of papers. 
Um +Jacob Devlin and others uh sort of +popularized, or initially pioneered, this back +in 2014, and then it's become popular +again through a number of open-source +models. Like, Baichuan I think was the +first open-source model to do it, but +then DCLM and OLMo and others have been +using this trick to stabilize their +output soft maxes. +So, this is this is a surprisingly +effective thing. +Now, okay. So, let's say we've handled +the instability issues on the output +soft max. +Now, we have to sort of turn our +attention towards the other potential +problem, which is attention, right? And +this is a a place where lots of +degeneracies happen. Lots of techniques +have been developed to control the +instability that attention operations +generate. +Um and really the, you know, the +high-level thing that I'll say +is +if you have instability, if you can +throw a layer norm in there somehow, it +might control it. And that's really in +some sense the the design philosophy +behind this idea called the QK norm. Um +so, what you do is remember that we +have, you know, our Qs and Ks um that +are going to be multiplied together, and +then they're going to go into the soft +max, right? So, in the standard +attention operation, I'm going to layer +norm as a pre-layer norm, multiply with +a QKV, and then I'm going to get my Qs +and Ks. Those will get multiplied by a +matrix multiply, I'll soft max them, and +I'll multiply that with V to get the +weighted average, and then I'll output +whatever comes after, right? So, this is +our usual attention. Now, what happens +if we just throw in a layer norm before +we multiply the Qs and Ks? If we do +that, then we know that the inputs to +this matrix multiply, and therefore the +inputs to the soft max, roughly have the +same scale. They're always going to have +a scale of roughly one because we've, +you know, used RMS norm to divide the +size of those Qs and Ks. +Okay.
If we do that, then, you know, +we're kind of going to keep this soft +max operation stable. Tons of different +models do this. It's originally from +the multimodal world. +You know, some folks who were doing +making multimodal models sort of +initially discovered QK norm. +Idefics and Chameleon really, you know, +used this and like proved it out. And +then a number of other open-source +language models, you know, realized that +the same tricks are entirely applicable +to +stabilizing attention for language +models, and I think this is now very, +very standard. Like QK norm is actually +a very standard intervention that most +of the large models now introduce. Um it +doesn't seem to affect performance um +from lots of different training runs, +but it does definitely prevent the kinds +of um attention degeneracies. Um and, +you know, I I'm really +>> [laughter] +>> the the way that I've seen this is, +you +know, we have layer norms initially in +the pre-norm. Now, we add them after the +the +nonlinearities in each block, and now +we're throwing them in both the Qs and +the Ks. And really, I think this is is +is kind of getting at +the stabilization tricks that people +apply to this world. Okay. +Now, +um the final set of things that I'll +talk about as a stability intervention. +And frankly, this one is not as popular +and more of a of a Google-specific trick +that I've seen. Um but uh logit soft +capping is a much harder intervention +that some people apply. So, this one, um +you know, in QK norm, what we're doing +is we are controlling the inputs to the +soft max and sort of hoping that the +outputs are well-behaved. If we really, +really want to enforce +um well-behaved outputs, what we can do +is we can kind of take the logits, the +things that go straight into the soft +max, and we can just cap them off so +they can never be too large or too +small, right? This is a hard almost a +hard constraint.
Um it's called a soft +cap, of course, but a tanh, you know, +is bounded at some value. +Um and so uh this is in the Gemma +models. Um I think both Gemma or like +Gemma 2, 3, and 4 all use the +logit soft cap trick. Um and what they +do is they take all of their logits from +the attention layers, and then they soft +cap them at some value. +Um some NVIDIA folks have done actually +quite nice work doing systematic +comparisons of these stability +interventions. Um and what they find is +um if you start with a baseline model, +you can do all sorts of different +interventions, and QK norm is here, and +it does slightly better due to the fact +that you can crank up the learning rate +a little bit. Um but if you do soft +capping alone, you actually end up +losing performance. So, there is a a +quality degradation that happens. This +is a very strong intervention. You can +never express very confident uh signals +in your soft max beyond a certain point. +Um so, it does have some negative +consequences, but this is a very safe +way of stabilizing the outputs of your +attention. Or sorry, the the inputs to +your attention, the logits that go into +the soft max. +So, that's kind of the end of the +stability components. Um I can pause for +a moment here. Um and I'll talk about +sort of various attention things um +after that. +All right. So, +the last thing I want to talk about +today is various interventions that you +can make to your attention head. +Um and as I was saying at the beginning +of this lecture, +um I'm only going to talk about all the +things that you can do to sort of dense +all by all attention today. So, if you +if you were interested in hearing about +state space models or linear time +attention, um sadly today is not the day +for you.
+Um the things that I do want to talk +about, which are really commonly +implemented um attention interventions +today, are uh group query attention, +which really saves inference cost by +reducing the number of heads, um and +sparse or sliding window attention, um +which really originally came from the +GPT-3-ish +family, but have now really been adopted +widely by most models that are looking +to do uh long context unless they're +doing exotic uh SSM stuff. +So, I'll start with um group query +attention or GQA or MQA. Um +this I'm going to first set up the need +for these kinds of things, and then +you'll kind of hopefully see what the +what the trick is um and why it's fairly +natural. +So, for the moment, we've been talking +about, you know, training and modeling +and all these things, but like let's +take a pause, and now let's think about +deployment, right? You train this very +big model, and now you need to serve it +to lots of users, and you're going to +pay a cost for serving. +And you're going to have to, in abstract +sense, pay for two different resources, +right? You're going to have to pay for +your flops, right? The computation that +you're performing, but you also have to +pay for another thing. You have to pay +for your memory accesses, right? Because +the memory accesses are also going to +impact, you know, your system's +characteristics, your latency, your +utilization, right? So, you want both of +these things to be small. +Now, let's think about what happens +during training or alternatively prefill +when you're looking at your prompt where +someone gives you the stuff. In this +case, you know, the total arithmetic +operations you have is, you know, order +of magnitude batch size sequence length +hidden dim squared, right? That's +roughly the size of things that you get. +And of course, you know, we're doing +quadratic attention, so we've got D +squared. +we've got uh +total memory accesses. Like what is our +memory access that we have here? 
Um we +have batch times sequence length times +uh hidden dim plus um +the sort of cost of the soft max, which +has an N squared component, and then +we've got a D squared component um for +the for the projections. So, the +arithmetic intensity here is pretty +good. Um +it's going to be one over K. This is the +number of heads, so you need to have um +uh sorry, head dims. So, your head dims +need to be big enough that you're +multiplying some reasonably sized +matrices. And you've got a one over BN, +so your sequences need to be long enough +or your batch sizes need to be big +enough. As long as both of these are +true, your GPUs are going to be fully +utilized. Great, right? You're You're +using all of your resources. +Now, you know, we have done we have +finished training, and now we're serving +our users. How do we serve our users? +We're going to generate tokens and send +it to them, right? Now, for doing that, +um I can't parallelize the generation +process. What I'm going to do is I'm +going to generate a token, I'm going to +condition on it, I'm going to generate +the next token, and I'm going to repeat +this process one by one, right? This is +just sort of the the curse of +autoregressive language modeling. We +have to do this. +Um in order to do this, the efficient +way to do it is to maintain all of the +sort of past keys and values that I've +had in what's called a KV cache +over the past, and then whenever I need +to compute something new, I can reuse +sort of the the submatrices that I've +already had from the past. And I only +really need to compute sort of the new +um query key interactions that I need to +fill out the rest of this matrix, right? +So, every submatrix I've computed +before, I can keep. I only need to +compute my new ones. So, this saves a +lot on compute, right? That's great. +But, +the issue here is now our arithmetic +intensity is not so good, right?
As you +might sort of intuit, this KV cache +approach is going to be reading and +reading um parameters all the time, +right? Each time I have a new step, +right? +I'm going to have to read in my +parameters. I'm going to have to take +these dot products, and I'm going to do +this once every step. +And so, now what do I have? Well, you +know, my total memory Oh, sorry. My +total arithmetic operations are the +same. I'm multiplying the same matrices +still, right? Just incrementally rather +than all at once. But because I'm doing +this incrementally, you know, now I have +um a memory access pattern of batch by +sequence squared by hidden dim plus um +sequence +by hidden dim squared. And the +second term is not so pleasant, right? +It used to be that it was just D +squared, but now we've got N times D +squared. +And if we compute the +arithmetic intensity, which is the ratio +of these two guys, um now we have N over +D plus one over B. So, now what we need +is large batches plus short sequence +length, or we need really big model +dimensions. So, if we want to serve a +small model efficiently, this is not so +good. Right? Um this is really difficult to +deal with, right? This N over D term, +this first term over here, which is +sequence length over hidden dim, is very +difficult to reduce if we're doing this +incremental computation. This is just a +hard thing to deal with. +So, this leads to this idea of MQA or +multi-query attention. Normally, you +have multiple heads in your attention +operation, and you're going to have +different keys, different values, and +different +queries. That's normally how +you do things. +But, one thing that we could do is maybe +we can keep the keys and the Vs the same +across all the heads, and the only thing +that's different across the heads are +the queries.
If we do this, then this +drastically removes the amount of items +that need to be moved in and out of +memory, right? Because the KV cache, +right? Are now significantly smaller. +These are all shared across all the +heads. Um this significantly reduces the +total memory access as well as the +arithmetic intensity, and we're kind of +the key term that we were talking about +here, we had the N over D term. Now, we +have uh H multiplying this, right? And +so, this H term allows us to +significantly reduce the the I sorry, +increase the arithmetic intensity if we +have a lot of heads, right? This is a +significant gain over what we had +before. +So, this gets us significant efficiency +improvements, +but the issue with MQA is +this is on the right here, you have one +value and one key for all these queries. +You do in fact lose significant +expressive power if you do this. +And so, there's this trade-off between +system's efficiency and expressiveness, +and you might wonder, is there sort of a +sweet spot in which we can avoid trading +off, you know, quite a significantly +expressive power and computation. And +that's where GQA or grouped query +attention comes in. You know, the +original transformer is multi-head. We +have queries and keys for each head. In +multi-query, we have one key and value +for each for all the heads. In grouped +query, we reduce the amount of keys and +values, but we keep the number of +queries the same. So, we now have this +ratio that we can play with, which is +kind of the number of key heads or the +number of value heads +while keeping the total number of heads +much larger than that. So, this allows +us to very simply control the +the trade-off between expressiveness and +inference efficiency. +Um there are other sort of tricks from +DeepSeek-V2, multi-head latent +attention, that I'll sort of mention +briefly next time, +which sort of have a different kind of +factorization structure and a different +set of trade-offs. 
But really, +the nice thing about GQA is that in +practice the trade-off is quite +favorable. So, if you have multi-head, +your performance, this was you know, in +the I think T5 days if I remember right. +This is your downstream model +performance. This is your time per +sample. You want to reduce this as much +as possible. +With multi-head attention, you have best +performance but very high cost. +With MQA, you know, you have +lower cost but much lower performance. +Similarly, if you make your model +smaller to try to hit your performance +targets, you get much worse performance. +GQA really does get the best of both +worlds, you know, very low inference +cost, nearly the same performance as +your full multi-head. +Um and you see sort of this like GQA +group structure where if you have +a small reduction in the number of +heads, +you basically have most of the gains in +your performance, which allows you to +sort of keep most of the expressive +power +while getting significant inference +improvements. And Percy will talk a +bunch more about sort of the inference +mechanics later, +but sort of this should give you a +flavor of like why models today almost +all adopt this GQA structure because it +gives you a lot of this inference cost, +which is really critical, without very +much of a +expressiveness hit. +Cool. Any questions for for GQA or KV +cache? Yeah. +Given +that you have so many +like rules of thumb for what +hyper-parameters are good, like to what +extent are you still searching over +hyper-parameters versus exploiting these +rules of thumb that you +I think it's a mix of both. I think +every sort of model training run has +some theses about what can be varied. +And so, you see this in a lot of the +reports where I think the +hyper-parameters are often not where +people are touching too much. +But you see like architecture changes +like one at a time in a lot of these +reports. But it's very rare to like go +and change everything up. 
I think Google +is one of the only orgs that seems to +like really spice things up in a +significant way. +The Gemma series has done some pretty +interesting things. +The most recent Gemma 4 release, for +example, now has like individual +embedding for every layer +in a way to control the trade-offs +between like memory use and flops. +They're very interesting set of things +that they've done. +Oh, yeah, back there. +Do you experiment with like data all +things of these parameters +during training? +During training, +let me think. +Weight decay, yes. Weight decay, people +change in concert with like learning +rate. +That is actually a heuristic that people +do that works very well. +Um other than that, I don't know if +there's a lot of different hypers that +people change during training, +especially because the architecture ones +just make training incompatible. So, you +can't really, you know, change them +while you're you're training. +Yeah, so I think I think weight decay is +probably the one that I can think of. +The others are usually fixed. +Yeah, MQA is uh +it's not just inference time fixed. It's +a pleasure to train. That's right, yeah. +You you you train with a certain number +of keys. +Okay. The last thing I'll talk about is +sliding window attention, which is a +really old idea. Like, you know, GPT-3 +used actually this you know, if you read +the paper, they'll say we alternate +between full attention, which where +every position can attend to everyone in +the past, and a banded matrix style +attention where you can attend to +everyone within a fixed window. +And you know, OpenAI has some early work +on these kinds of like different kinds +of attention patterns that you can use. +But actually, this has become really +really popular over the past year. 
+This idea of alternating, you know, the +big full attention and a more local +attention actually hits a sweet spot for +how to manage like long context +performance while not paying too much +for inference. +Um I think, you know, the more recent +revival in open models, I would maybe +say Cohere Command A was the first one I +saw do it, where they, you know, had +this like structure where every four +layers they would have a full attention +that attended to everything. The three +layers in between would use a sliding +window attention that would only be able +to look at local structure. And of +course, you know, as you go up sorry, in +this case down cuz they ordered the the +diagram the other way. As you go down +sort of these blocks, you know, you're +aggregating local information into +global ones. The local attentions at the +end can of course access more global +information, but this, you know, allows +you to manage the +the the cost of having a really long +context without having to go for +something like a state-space model or +more exotic intervention. And that's +worked quite well. +Um there's also some innovation where +people change the embedding format for +the long range information where they +get rid of things like rope, so you have +no position embeddings at all. So, +you're really looking almost at bags +where the short range information still +gets position information. So, people do +all sorts of you know, +kinds of interventions involving these +these +both the embeddings and alternating +local and global structure. +Um I'll say that this is a +you know, attention and in general how +to manage the trade-off between long +context and sort of long context cost +and performance is a still an active +area of investigation. It's a place +where the most architecture work and +changes are still being done. Um we see +essentially a bunch of other models +adopt this idea. 
Llama 4, most recently +Gemma 4, OLMo 3, they all do this +combination of sliding window attention +and full attention, in their case using +full RoPE instead of NoPE as the +embedding. +So, as I said, this is becoming really +really popular. +Qwen 3.5, which I put on the right, +they're actually a little bit different +because they alternate a state-space +model +called a Gated DeltaNet and a full +attention every sort of, you know, one +one full attention every four layer +every four layers. +So, it's the same alternating structure, +but they're using a different sort of +cheap layer. In their case, they're +using a state-space model. I'll explain +what that is next lecture instead of a +sliding window sort of local attention. +But you see this is like I think a new +theme over the past year where, you +know, open models are really trying to +grapple with long context performance, +and the way to do that, at least so far, +is to have these hybrid models that +aren't just global attention, aren't +just cheap attention. They're some sort +of mix in between. And that's that seems +to have worked very well so far in a lot +of these models. +Okay, cool. So, as I was sort of trying +to emphasize, when you look across all +of these models, you start to see a lot +of patterns and hopefully a sense of +general understanding about what things +you can do and what things are good to +folks. Um we also see a lot of +differences in how we handle context and +how we handle position embeddings. +Even tokenization, there's some +differences, right? So, there are +differences across these models, but +there's also commonalities that +hopefully now give you some intuition as +you go out and do your assignments and +and sort of mess with the leaderboard +and so on. +Thanks.
\ No newline at end of file diff --git a/conductor/tracks/video_analysis_cs336_architectures_20260621/artifacts/video.log b/conductor/tracks/video_analysis_cs336_architectures_20260621/artifacts/video.log new file mode 100644 index 00000000..72a36c39 --- /dev/null +++ b/conductor/tracks/video_analysis_cs336_architectures_20260621/artifacts/video.log @@ -0,0 +1,16 @@ +# yt-dlp log +# url: https://youtu.be/lVynu4bo1rY +# output: conductor/tracks/video_analysis_cs336_architectures_20260621/artifacts/video.mp4 +# returncode: 0 + +stdout: +[youtube] Extracting URL: https://youtu.be/lVynu4bo1rY +[youtube] lVynu4bo1rY: Downloading webpage +[youtube] lVynu4bo1rY: Downloading android vr player API JSON +[info] lVynu4bo1rY: Downloading 1 format(s): 400+251 +[download] video.mp4.f400.mp4 +[download] video.mp4.f251.webm +[Merger] Merging formats into video.mp4 + +stderr: +WARNING: yt-dlp EJS not enabled; some formats may be missing.