Merge branch 'master' into core-simd-indices-redadd-redmul

2026-07-15 23:51:25 -07:00 · 2025-05-05 16:37:02 -04:00
parent b0f53a6eaf 2224911aca
commit 9814370659
12 changed files with 174 additions and 515 deletions
@@ -298,7 +298,7 @@ simd_masked_store :: proc(ptr: rawptr, val: #simd[N]T, mask: #simd[N]U)
 simd_masked_expand_load    :: proc(ptr: rawptr, val: #simd[N]T, mask: #simd[N]U) -> #simd[N]T where type_is_integer(U) || type_is_boolean(U) ---
 simd_masked_compress_store :: proc(ptr: rawptr, val: #simd[N]T, mask: #simd[N]U)              where type_is_integer(U) || type_is_boolean(U) ---

-
+simd_indices :: proc($T: typeid/#simd[$N]$E) -> T where type_is_numeric(E) ---

 simd_shuffle :: proc(a, b: #simd[N]T, indices: ..int) -> #simd[len(indices)]T ---
 simd_select  :: proc(cond: #simd[N]boolean_or_integer, true, false: #simd[N]T) -> #simd[N]T ---
@@ -257,7 +257,7 @@ reader_read_rune :: proc(b: ^Reader) -> (r: rune, size: int, err: io.Error) {
 	for b.r+utf8.UTF_MAX > b.w &&
 	    !utf8.full_rune(b.buf[b.r:b.w]) &&
 	    b.err == nil &&
-	    b.w-b.w < len(b.buf) {
+	    b.w-b.r < len(b.buf) {
 		_reader_read_new_chunk(b) or_return
 	}

@@ -12,7 +12,30 @@ import win32 "core:sys/windows"

 INVALID_HANDLE :: ~uintptr(0)

-S_IWRITE :: 0o200
+// NOTE(Jeroen): We don't translate mode flags for Linux when given to `chmod`.
+//               Let's not do so for Windows for `chmod` or `read_directory_iterator` either.
+//               They're *not* portable between Windows and non-Windows platforms.
+//
+//               It also leads to information loss as flags like Archive, Hidden and System have no equivalent there.
+//               We can of course parse them so we can set the `.Symlink` and `.Directory` type, but we shouldn't pretend
+//               that 0o644 is meaningful when returned as a mode.
+//               `C:\bootmgr` as an example has attributes read only, hidden, system, archive. In no way is it sensible to replace that with 0o444.
+FILE_ATTRIBUTE_READONLY            :: win32.FILE_ATTRIBUTE_READONLY            // 0x00000001
+FILE_ATTRIBUTE_HIDDEN              :: win32.FILE_ATTRIBUTE_HIDDEN              // 0x00000002
+FILE_ATTRIBUTE_SYSTEM              :: win32.FILE_ATTRIBUTE_SYSTEM              // 0x00000004
+FILE_ATTRIBUTE_DIRECTORY           :: win32.FILE_ATTRIBUTE_DIRECTORY           // 0x00000010
+FILE_ATTRIBUTE_ARCHIVE             :: win32.FILE_ATTRIBUTE_ARCHIVE             // 0x00000020
+FILE_ATTRIBUTE_DEVICE              :: win32.FILE_ATTRIBUTE_DEVICE              // 0x00000040
+FILE_ATTRIBUTE_NORMAL              :: win32.FILE_ATTRIBUTE_NORMAL              // 0x00000080
+FILE_ATTRIBUTE_TEMPORARY           :: win32.FILE_ATTRIBUTE_TEMPORARY           // 0x00000100
+FILE_ATTRIBUTE_SPARSE_FILE         :: win32.FILE_ATTRIBUTE_SPARSE_FILE         // 0x00000200
+FILE_ATTRIBUTE_REPARSE_Point       :: win32.FILE_ATTRIBUTE_REPARSE_Point       // 0x00000400 (same value as FILE_ATTRIBUTE_REPARSE_POINT below; presumably a mixed-case alias mirrored from `win32` - confirm)
+FILE_ATTRIBUTE_REPARSE_POINT       :: win32.FILE_ATTRIBUTE_REPARSE_POINT       // 0x00000400
+FILE_ATTRIBUTE_COMPRESSED          :: win32.FILE_ATTRIBUTE_COMPRESSED          // 0x00000800
+FILE_ATTRIBUTE_OFFLINE             :: win32.FILE_ATTRIBUTE_OFFLINE             // 0x00001000
+FILE_ATTRIBUTE_NOT_CONTENT_INDEXED :: win32.FILE_ATTRIBUTE_NOT_CONTENT_INDEXED // 0x00002000
+FILE_ATTRIBUTE_ENCRYPTED           :: win32.FILE_ATTRIBUTE_ENCRYPTED           // 0x00004000
+
 _ERROR_BAD_NETPATH :: 53
 MAX_RW :: 1<<30

@@ -122,7 +145,7 @@ _open_internal :: proc(name: string, flags: File_Flags, perm: int) -> (handle: u
 	}

 	attrs: u32 = win32.FILE_ATTRIBUTE_NORMAL|win32.FILE_FLAG_BACKUP_SEMANTICS
-	if perm & S_IWRITE == 0 {
+	if u32(perm) & FILE_ATTRIBUTE_NORMAL == 0 {
 		attrs = win32.FILE_ATTRIBUTE_READONLY
 		if create_mode == win32.CREATE_ALWAYS {
 			// NOTE(bill): Open has just asked to create a file in read-only mode.
@@ -748,20 +771,10 @@ _fchmod :: proc(f: ^File, mode: int) -> Error {
 	if f == nil || f.impl == nil {
 		return nil
 	}
-	d: win32.BY_HANDLE_FILE_INFORMATION
-	if !win32.GetFileInformationByHandle(_handle(f), &d) {
-		return _get_platform_error()
-	}
-	attrs := d.dwFileAttributes
-	if mode & S_IWRITE != 0 {
-		attrs &~= win32.FILE_ATTRIBUTE_READONLY
-	} else {
-		attrs |= win32.FILE_ATTRIBUTE_READONLY
-	}

 	info: win32.FILE_BASIC_INFO
-	info.FileAttributes = attrs
-	if !win32.SetFileInformationByHandle(_handle(f), .FileBasicInfo, &info, size_of(d)) {
+	info.FileAttributes = win32.DWORD(mode)
+	if !win32.SetFileInformationByHandle(_handle(f), .FileBasicInfo, &info, size_of(info)) {
 		return _get_platform_error()
 	}
 	return nil
@@ -800,19 +813,11 @@ _chtimes :: proc(name: string, atime, mtime: time.Time) -> Error {
 	defer close(f)
 	return _fchtimes(f, atime, mtime)
 }
+
 _fchtimes :: proc(f: ^File, atime, mtime: time.Time) -> Error {
 	if f == nil || f.impl == nil {
 		return nil
 	}
-	d: win32.BY_HANDLE_FILE_INFORMATION
-	if !win32.GetFileInformationByHandle(_handle(f), &d) {
-		return _get_platform_error()
-	}
-
-	to_windows_time :: #force_inline proc(t: time.Time) -> win32.LARGE_INTEGER {
-		// a 64-bit value representing the number of 100-nanosecond intervals since January 1, 1601 (UTC)
-		return win32.LARGE_INTEGER(time.time_to_unix_nano(t) * 100 + 116444736000000000)
-	}

 	atime, mtime := atime, mtime
 	if time.time_to_unix_nano(atime) < time.time_to_unix_nano(mtime) {
@@ -820,9 +825,9 @@ _fchtimes :: proc(f: ^File, atime, mtime: time.Time) -> Error {
 	}

 	info: win32.FILE_BASIC_INFO
-	info.LastAccessTime = to_windows_time(atime)
-	info.LastWriteTime  = to_windows_time(mtime)
-	if !win32.SetFileInformationByHandle(_handle(f), .FileBasicInfo, &info, size_of(d)) {
+	info.LastAccessTime = time_as_filetime(atime)
+	info.LastWriteTime  = time_as_filetime(mtime)
+	if !win32.SetFileInformationByHandle(_handle(f), .FileBasicInfo, &info, size_of(info)) {
 		return _get_platform_error()
 	}
 	return nil
@@ -162,7 +162,7 @@ _process_info_by_pid :: proc(pid: int, selection: Process_Info_Fields, allocator
 		}
 	}

-	cmdline_if: if selection & {.Working_Dir, .Command_Line, .Command_Args, .Executable_Path} != {} {
+	cmdline_if: if selection & {.Working_Dir, .Command_Line, .Command_Args} != {} {
 		strings.builder_reset(&path_builder)
 		strings.write_string(&path_builder, "/proc/")
 		strings.write_int(&path_builder, pid)
@@ -178,12 +178,12 @@ _process_info_by_pid :: proc(pid: int, selection: Process_Info_Fields, allocator
 		terminator := strings.index_byte(cmdline, 0)
 		assert(terminator > 0)

-		command_line_exec := cmdline[:terminator]
+		// command_line_exec := cmdline[:terminator]

 		// Still need cwd if the execution on the command line is relative.
 		cwd: string
 		cwd_err: Error
-		if .Working_Dir in selection || (.Executable_Path in selection && command_line_exec[0] != '/') {
+		if .Working_Dir in selection {
 			strings.builder_reset(&path_builder)
 			strings.write_string(&path_builder, "/proc/")
 			strings.write_int(&path_builder, pid)
@@ -199,18 +199,6 @@ _process_info_by_pid :: proc(pid: int, selection: Process_Info_Fields, allocator
 			}
 		}

-		if .Executable_Path in selection {
-			if cmdline[0] == '/' {
-				info.executable_path = strings.clone(cmdline[:terminator], allocator) or_return
-				info.fields += {.Executable_Path}
-			} else if cwd_err == nil {
-				info.executable_path = join_path({ cwd, cmdline[:terminator] }, allocator) or_return
-				info.fields += {.Executable_Path}
-			} else {
-				break cmdline_if
-			}
-		}
-
 		if selection & {.Command_Line, .Command_Args} != {} {
 			// skip to first arg
 			//cmdline = cmdline[terminator + 1:]
@@ -323,6 +311,30 @@ _process_info_by_pid :: proc(pid: int, selection: Process_Info_Fields, allocator
 		}
 	}

+	if .Executable_Path in selection {
+		/*
+		NOTE(Jeroen):
+
+		The old version returned the wrong executable path for things like `bash` or `sh`,
+		for which `/proc/<pid>/cmdline` will just report "bash" or "sh",
+		resulting in misleading paths like `$PWD/sh`, even though that executable doesn't exist there.
+
+		Thanks to Yawning for suggesting `/proc/self/exe`.
+		*/
+
+		strings.builder_reset(&path_builder)
+		strings.write_string(&path_builder, "/proc/")
+		strings.write_int(&path_builder, pid)
+		strings.write_string(&path_builder, "/exe")
+
+		if exe_bytes, exe_err := _read_link(strings.to_string(path_builder), temp_allocator()); exe_err == nil {
+			info.executable_path = strings.clone(string(exe_bytes), allocator) or_return
+			info.fields += {.Executable_Path}
+		} else {
+			err = exe_err
+		}
+	}
+
 	if .Environment in selection {
 		strings.builder_reset(&path_builder)
 		strings.write_string(&path_builder, "/proc/")
@@ -212,11 +212,15 @@ _file_type_from_create_file :: proc(wname: win32.wstring, create_file_attributes
 }

 _file_type_mode_from_file_attributes :: proc(file_attributes: win32.DWORD, h: win32.HANDLE, ReparseTag: win32.DWORD) -> (type: File_Type, mode: int) {
-	if file_attributes & win32.FILE_ATTRIBUTE_READONLY != 0 {
-		mode |= 0o444
-	} else {
-		mode |= 0o666
-	}
+	// NOTE(Jeroen): We don't translate mode flags for Linux when given to `chmod`.
+	//               Let's not do so for Windows for `chmod` or `read_directory_iterator` either.
+	//               They're *not* portable between Windows and non-Windows platforms.
+	//
+	//               It also leads to information loss as flags like Archive, Hidden and System have no equivalent there.
+	//               We can of course parse them so we can set the `.Symlink` and `.Directory` type, but we shouldn't pretend
+	//               that 0o644 is meaningful when returned as a mode.
+	//               `C:\bootmgr` as an example has attributes read only, hidden, system, archive. In no way is it sensible to replace that with 0o444.
+	mode = int(file_attributes)

 	is_sym := false
 	if file_attributes & win32.FILE_ATTRIBUTE_REPARSE_POINT == 0 {
@@ -229,21 +233,36 @@ _file_type_mode_from_file_attributes :: proc(file_attributes: win32.DWORD, h: wi
 		type = .Symlink
 	} else if file_attributes & win32.FILE_ATTRIBUTE_DIRECTORY != 0 {
 		type = .Directory
-		mode |= 0o111
 	} else if h != nil {
 		type = file_type(h)
 	}
 	return
 }

+// Converts `t` to a Windows file time: a 64-bit value representing the number of 100-nanosecond intervals since January 1, 1601 (UTC)
+time_as_filetime :: #force_inline proc(t: time.Time) -> (ft: win32.LARGE_INTEGER) {
+	win := u64(t._nsec / 100) + 116444736000000000 // nanoseconds -> 100ns ticks, then add 116444736000000000 ticks (11644473600 s, the gap from 1601-01-01 to 1970-01-01); assumes `_nsec` counts from the Unix epoch - confirm
+	return win32.LARGE_INTEGER(win)
+}
+
+filetime_as_time_li :: #force_inline proc(ft: win32.LARGE_INTEGER) -> (t: time.Time) { // inverse of `time_as_filetime`
+	return {_nsec=(i64(ft) - 116444736000000000) * 100} // remove the same tick offset, then scale 100ns ticks to nanoseconds
+}
+
+filetime_as_time_ft :: #force_inline proc(ft: win32.FILETIME) -> (t: time.Time) { // same conversion for the split low/high representation
+	return filetime_as_time_li(win32.LARGE_INTEGER(ft.dwLowDateTime) + win32.LARGE_INTEGER(ft.dwHighDateTime) << 32) // evaluates as low + (high << 32): in Odin `<<` binds tighter than `+`
+}
+
+filetime_as_time :: proc{filetime_as_time_ft, filetime_as_time_li} // overload set: accepts either a `win32.FILETIME` or a `win32.LARGE_INTEGER`
+
 _file_info_from_win32_file_attribute_data :: proc(d: ^win32.WIN32_FILE_ATTRIBUTE_DATA, name: string, allocator: runtime.Allocator) -> (fi: File_Info, e: Error) {
 	fi.size = i64(d.nFileSizeHigh)<<32 + i64(d.nFileSizeLow)
 	type, mode := _file_type_mode_from_file_attributes(d.dwFileAttributes, nil, 0)
 	fi.type = type
 	fi.mode |= mode
-	fi.creation_time     = time.unix(0, win32.FILETIME_as_unix_nanoseconds(d.ftCreationTime))
-	fi.modification_time = time.unix(0, win32.FILETIME_as_unix_nanoseconds(d.ftLastWriteTime))
-	fi.access_time       = time.unix(0, win32.FILETIME_as_unix_nanoseconds(d.ftLastAccessTime))
+	fi.creation_time     = filetime_as_time(d.ftCreationTime)
+	fi.modification_time = filetime_as_time(d.ftLastWriteTime)
+	fi.access_time       = filetime_as_time(d.ftLastAccessTime)
 	fi.fullpath, e = full_path_from_name(name, allocator)
 	fi.name = basename(fi.fullpath)
 	return
@@ -254,9 +273,9 @@ _file_info_from_win32_find_data :: proc(d: ^win32.WIN32_FIND_DATAW, name: string
 	type, mode := _file_type_mode_from_file_attributes(d.dwFileAttributes, nil, 0)
 	fi.type = type
 	fi.mode |= mode
-	fi.creation_time     = time.unix(0, win32.FILETIME_as_unix_nanoseconds(d.ftCreationTime))
-	fi.modification_time = time.unix(0, win32.FILETIME_as_unix_nanoseconds(d.ftLastWriteTime))
-	fi.access_time       = time.unix(0, win32.FILETIME_as_unix_nanoseconds(d.ftLastAccessTime))
+	fi.creation_time     = filetime_as_time(d.ftCreationTime)
+	fi.modification_time = filetime_as_time(d.ftLastWriteTime)
+	fi.access_time       = filetime_as_time(d.ftLastAccessTime)
 	fi.fullpath, e = full_path_from_name(name, allocator)
 	fi.name = basename(fi.fullpath)
 	return
@@ -286,9 +305,9 @@ _file_info_from_get_file_information_by_handle :: proc(path: string, h: win32.HA
 	type, mode := _file_type_mode_from_file_attributes(d.dwFileAttributes, h, 0)
 	fi.type = type
 	fi.mode |= mode
-	fi.creation_time     = time.unix(0, win32.FILETIME_as_unix_nanoseconds(d.ftCreationTime))
-	fi.modification_time = time.unix(0, win32.FILETIME_as_unix_nanoseconds(d.ftLastWriteTime))
-	fi.access_time       = time.unix(0, win32.FILETIME_as_unix_nanoseconds(d.ftLastAccessTime))
+	fi.creation_time     = filetime_as_time(d.ftCreationTime)
+	fi.modification_time = filetime_as_time(d.ftLastWriteTime)
+	fi.access_time       = filetime_as_time(d.ftLastAccessTime)
 	return fi, nil
 }

@@ -1759,7 +1759,7 @@ Returns:
 replace :: intrinsics.simd_replace

 /*
-Reduce a vector to a scalar by adding up all the lanes in an ordered fashion.
+Reduce a vector to a scalar by adding up all the lanes.

 This procedure returns a scalar that is the ordered sum of all lanes. The
 ordered sum may be important for accounting for precision errors in
@@ -2511,460 +2511,16 @@ recip :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where int
 	return T(1) / v
 }

+
 /*
 Create a vector where each lane contains the index of that lane.
-
 Inputs:
 - `V`: The type of the vector to create.
-
 Result:
 - A vector of the given type, where each lane contains the index of that lane.
-
 **Operation**:
-
 	for i in 0 ..< N {
 		res[i] = i
 	}
 */
-indices :: #force_inline proc "contextless" ($V: typeid/#simd[$N]$E) -> V where intrinsics.type_is_numeric(E) {
-	when N == 1 {
-		return {0}
-	} else when N == 2 {
-		return {0, 1}
-	} else when N == 4 {
-		return {0, 1, 2, 3}
-	} else when N == 8 {
-		return {0, 1, 2, 3, 4, 5, 6, 7}
-	} else when N == 16 {
-		return {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
-	} else when N == 32 {
-		return {
-			0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
-			16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-		}
-	} else when N == 64 {
-		return {
-			0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
-			16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-			32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-			48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-		}
-	} else {
-		#panic("Unsupported vector size!")
-	}
-}
-
-/*
-Reduce a vector to a scalar by adding up all the lanes in a pairwise fashion.
-
-This procedure returns a scalar that is the sum of all lanes, calculated by
-adding each even-indexed element with the following odd-indexed element to
-produce N/2 values. This is repeated until only a single element remains. This
-order is supported by hardware instructions for some types/architectures (e.g.
-i16/i32/f32/f64 on x86 SSE, i8/i16/i32/f32 on ARM NEON).
-
-The order of the sum may be important for accounting for precision errors in
-floating-point computation, as floating-point addition is not associative, that
-is `(a+b)+c` may not be equal to `a+(b+c)`.
-
-Inputs:
- `v`: The vector to reduce.
-
-Result:
- Sum of all lanes, as a scalar.
-
-**Operation**:
-
-	for n > 1 {
-		n = n / 2
-		for i in 0 ..< n {
-			a[i] = a[2*i+0] + a[2*i+1]
-		}
-	}
-	res := a[0]
-
-Graphical representation of the operation for N=4:
-
-	   +-----------------------+
-	v: | v0  | v1  | v2  | v3  |
-	   +-----------------------+
-	      |     |     |     |
-	      `>[+]<'     `>[+]<'
-	         |           |
-	         `--->[+]<--'
-	               |
-	               v
-	            +-----+
-	    result: | y0  |
-	            +-----+
-*/
-reduce_add_pairs :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
-	where intrinsics.type_is_numeric(E) {
-	when N == 64 { v64 := v }
-	when N == 32 { v32 := v }
-	when N == 16 { v16 := v }
-	when N == 8  { v8 := v }
-	when N == 4  { v4 := v }
-	when N == 2  { v2 := v }
-
-	when N >= 64 {
-		x32 := swizzle(v64,
-			0,  2,  4,  6,  8,  10, 12, 14,
-			16, 18, 20, 22, 24, 26, 28, 30,
-			32, 34, 36, 38, 40, 42, 44, 46,
-			48, 50, 52, 54, 56, 58, 60, 62)
-		y32 := swizzle(v64,
-			1,  3,  5,  7,  9,  11, 13, 15,
-			17, 19, 21, 23, 25, 27, 29, 31,
-			33, 35, 37, 39, 41, 43, 45, 47,
-			49, 51, 53, 55, 57, 59, 61, 63)
-		v32 := x32 + y32
-	}
-
-	when N >= 32 {
-		x16 := swizzle(v32,
-			0,  2,  4,  6,  8,  10, 12, 14,
-			16, 18, 20, 22, 24, 26, 28, 30)
-		y16 := swizzle(v32,
-			1,  3,  5,  7,  9,  11, 13, 15,
-			17, 19, 21, 23, 25, 27, 29, 31)
-		v16 := x16 + y16
-	}
-
-	when N >= 16 {
-		x8 := swizzle(v16, 0, 2, 4, 6, 8, 10, 12, 14)
-		y8 := swizzle(v16, 1, 3, 5, 7, 9, 11, 13, 15)
-		v8 := x8 + y8
-	}
-
-	when N >= 8 {
-		x4 := swizzle(v8, 0, 2, 4, 6)
-		y4 := swizzle(v8, 1, 3, 5, 7)
-		v4 := x4 + y4
-	}
-
-	when N >= 4 {
-		x2 := swizzle(v4, 0, 2)
-		y2 := swizzle(v4, 1, 3)
-		v2 := x2 + y2
-	}
-
-	when N >= 2 {
-		return extract(v2, 0) + extract(v2, 1)
-	} else {
-		return extract(v, 0)
-	}
-}
-
-/*
-Reduce a vector to a scalar by adding up all the lanes in a bisecting fashion.
-
-This procedure returns a scalar that is the sum of all lanes, calculated by
-bisecting the vector into two parts, where the first contains lanes [0, N/2)
-and the second contains lanes [N/2, N), and adding the two halves element-wise
-to produce N/2 values. This is repeated until only a single element remains.
-This order may be faster to compute than the ordered sum for floats, as it can
-often be better parallelized.
-
-The order of the sum may be important for accounting for precision errors in
-floating-point computation, as floating-point addition is not associative, that
-is `(a+b)+c` may not be equal to `a+(b+c)`.
-
-Inputs:
- `v`: The vector to reduce.
-
-Result:
- Sum of all lanes, as a scalar.
-
-**Operation**:
-
-	for n > 1 {
-		n = n / 2
-		for i in 0 ..< n {
-			a[i] += a[i+n]
-		}
-	}
-	res := a[0]
-
-Graphical representation of the operation for N=4:
-
-	     +-----------------------+
-	     | v0  | v1  | v2  | v3  |
-	     +-----------------------+
-	        |     |     |     |
-	       [+]<-- | ---'      |
-	        |    [+]<--------'
-	        |     |
-	        `>[+]<'
-	           |
-	           v
-	        +-----+
-	result: | y0  |
-	        +-----+
-*/
-reduce_add_bisect :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
-	where intrinsics.type_is_numeric(E) {
-	when N == 64 { v64 := v }
-	when N == 32 { v32 := v }
-	when N == 16 { v16 := v }
-	when N == 8  { v8 := v }
-	when N == 4  { v4 := v }
-	when N == 2  { v2 := v }
-
-	when N >= 64 {
-		x32 := swizzle(v64,
-			0,  1,  2,  3,  4,  5,  6,  7,
-			8,  9,  10, 11, 12, 13, 14, 15,
-			16, 17, 18, 19, 20, 21, 22, 23,
-			24, 25, 26, 27, 28, 29, 30, 31)
-		y32 := swizzle(v64,
-			32, 33, 34, 35, 36, 37, 38, 39,
-			40, 41, 42, 43, 44, 45, 46, 47,
-			48, 49, 50, 51, 52, 53, 54, 55,
-			56, 57, 58, 59, 60, 61, 62, 63)
-		v32 := x32 + y32
-	}
-
-	when N >= 32 {
-		x16 := swizzle(v32,
-			0,  1,  2,  3,  4,  5,  6,  7,
-			8,  9,  10, 11, 12, 13, 14, 15)
-		y16 := swizzle(v32,
-			16, 17, 18, 19, 20, 21, 22, 23,
-			24, 25, 26, 27, 28, 29, 30, 31)
-		v16 := x16 + y16
-	}
-
-	when N >= 16 {
-		x8 := swizzle(v16, 0, 1, 2,  3,  4,  5,  6,  7)
-		y8 := swizzle(v16, 8, 9, 10, 11, 12, 13, 14, 15)
-		v8 := x8 + y8
-	}
-
-	when N >= 8 {
-		x4 := swizzle(v8, 0, 1, 2, 3)
-		y4 := swizzle(v8, 4, 5, 6, 7)
-		v4 := x4 + y4
-	}
-
-	when N >= 4 {
-		x2 := swizzle(v4, 0, 1)
-		y2 := swizzle(v4, 2, 3)
-		v2 := x2 + y2
-	}
-
-	when N >= 2 {
-		return extract(v2, 0) + extract(v2, 1)
-	} else {
-		return extract(v, 0)
-	}
-}
-
-/*
-Reduce a vector to a scalar by multiplying all the lanes in a pairwise fashion.
-
-This procedure returns a scalar that is the product of all lanes, calculated by
-bisecting the vector into two parts, where the first contains lanes [0, N/2)
-and the second contains lanes [N/2, N), and multiplying the two halves together
-multiplying each even-indexed element with the following odd-indexed element to
-produce N/2 values. This is repeated until only a single element remains. This
-order may be faster to compute than the ordered product for floats, as it can
-often be better parallelized.
-
-The order of the product may be important for accounting for precision errors
-in floating-point computation, as floating-point multiplication is not
-associative, that is `(a*b)*c` may not be equal to `a*(b*c)`.
-
-Inputs:
- `v`: The vector to reduce.
-
-Result:
- Product of all lanes, as a scalar.
-
-**Operation**:
-
-	for n > 1 {
-		n = n / 2
-		for i in 0 ..< n {
-			a[i] = a[2*i+0] * a[2*i+1]
-		}
-	}
-	res := a[0]
-
-Graphical representation of the operation for N=4:
-
-	   +-----------------------+
-	v: | v0  | v1  | v2  | v3  |
-	   +-----------------------+
-	      |     |     |     |
-	      `>[x]<'     `>[x]<'
-	         |           |
-	         `--->[x]<--'
-	               |
-	               v
-	            +-----+
-	    result: | y0  |
-	            +-----+
-*/
-reduce_mul_pairs :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
-	where intrinsics.type_is_numeric(E) {
-	when N == 64 { v64 := v }
-	when N == 32 { v32 := v }
-	when N == 16 { v16 := v }
-	when N == 8  { v8 := v }
-	when N == 4  { v4 := v }
-	when N == 2  { v2 := v }
-
-	when N >= 64 {
-		x32 := swizzle(v64,
-			0,  2,  4,  6,  8,  10, 12, 14,
-			16, 18, 20, 22, 24, 26, 28, 30,
-			32, 34, 36, 38, 40, 42, 44, 46,
-			48, 50, 52, 54, 56, 58, 60, 62)
-		y32 := swizzle(v64,
-			1,  3,  5,  7,  9,  11, 13, 15,
-			17, 19, 21, 23, 25, 27, 29, 31,
-			33, 35, 37, 39, 41, 43, 45, 47,
-			49, 51, 53, 55, 57, 59, 61, 63)
-		v32 := x32 * y32
-	}
-
-	when N >= 32 {
-		x16 := swizzle(v32,
-			0,  2,  4,  6,  8,  10, 12, 14,
-			16, 18, 20, 22, 24, 26, 28, 30)
-		y16 := swizzle(v32,
-			1,  3,  5,  7,  9,  11, 13, 15,
-			17, 19, 21, 23, 25, 27, 29, 31)
-		v16 := x16 * y16
-	}
-
-	when N >= 16 {
-		x8 := swizzle(v16, 0, 2, 4, 6, 8, 10, 12, 14)
-		y8 := swizzle(v16, 1, 3, 5, 7, 9, 11, 13, 15)
-		v8 := x8 * y8
-	}
-
-	when N >= 8 {
-		x4 := swizzle(v8, 0, 2, 4, 6)
-		y4 := swizzle(v8, 1, 3, 5, 7)
-		v4 := x4 * y4
-	}
-
-	when N >= 4 {
-		x2 := swizzle(v4, 0, 2)
-		y2 := swizzle(v4, 1, 3)
-		v2 := x2 * y2
-	}
-
-	when N >= 2 {
-		return extract(v2, 0) * extract(v2, 1)
-	} else {
-		return extract(v, 0)
-	}
-}
-
-/*
-Reduce a vector to a scalar by multiplying up all the lanes in a bisecting fashion.
-
-This procedure returns a scalar that is the product of all lanes, calculated by
-bisecting the vector into two parts, where the first contains indices [0, N/2)
-and the second contains indices [N/2, N), and multiplying the two halves
-together element-wise to produce N/2 values. This is repeated until only a
-single element remains. This order may be faster to compute than the ordered
-product for floats, as it can often be better parallelized.
-
-The order of the product may be important for accounting for precision errors
-in floating-point computation, as floating-point multiplication is not
-associative, that is `(a*b)*c` may not be equal to `a*(b*c)`.
-
-Inputs:
- `v`: The vector to reduce.
-
-Result:
- Product of all lanes, as a scalar.
-
-**Operation**:
-
-	for n > 1 {
-		n = n / 2
-		for i in 0 ..< n {
-			a[i] *= a[i+n]
-		}
-	}
-	res := a[0]
-
-Graphical representation of the operation for N=4:
-
-	     +-----------------------+
-	     | v0  | v1  | v2  | v3  |
-	     +-----------------------+
-	        |     |     |     |
-	       [x]<-- | ---'      |
-	        |    [x]<--------'
-	        |     |
-	        `>[x]<'
-	           |
-	           v
-	        +-----+
-	result: | y0  |
-	        +-----+
-*/
-reduce_mul_bisect :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
-	where intrinsics.type_is_numeric(E) {
-	when N == 64 { v64 := v }
-	when N == 32 { v32 := v }
-	when N == 16 { v16 := v }
-	when N == 8  { v8 := v }
-	when N == 4  { v4 := v }
-	when N == 2  { v2 := v }
-
-	when N >= 64 {
-		x32 := swizzle(v64,
-			0,  1,  2,  3,  4,  5,  6,  7,
-			8,  9,  10, 11, 12, 13, 14, 15,
-			16, 17, 18, 19, 20, 21, 22, 23,
-			24, 25, 26, 27, 28, 29, 30, 31)
-		y32 := swizzle(v64,
-			32, 33, 34, 35, 36, 37, 38, 39,
-			40, 41, 42, 43, 44, 45, 46, 47,
-			48, 49, 50, 51, 52, 53, 54, 55,
-			56, 57, 58, 59, 60, 61, 62, 63)
-		v32 := x32 * y32
-	}
-
-	when N >= 32 {
-		x16 := swizzle(v32,
-			0,  1,  2,  3,  4,  5,  6,  7,
-			8,  9,  10, 11, 12, 13, 14, 15)
-		y16 := swizzle(v32,
-			16, 17, 18, 19, 20, 21, 22, 23,
-			24, 25, 26, 27, 28, 29, 30, 31)
-		v16 := x16 * y16
-	}
-
-	when N >= 16 {
-		x8 := swizzle(v16, 0, 1, 2,  3,  4,  5,  6,  7)
-		y8 := swizzle(v16, 8, 9, 10, 11, 12, 13, 14, 15)
-		v8 := x8 * y8
-	}
-
-	when N >= 8 {
-		x4 := swizzle(v8, 0, 1, 2, 3)
-		y4 := swizzle(v8, 4, 5, 6, 7)
-		v4 := x4 * y4
-	}
-
-	when N >= 4 {
-		x2 := swizzle(v4, 0, 1)
-		y2 := swizzle(v4, 2, 3)
-		v2 := x2 * y2
-	}
-
-	when N >= 2 {
-		return extract(v2, 0) * extract(v2, 1)
-	} else {
-		return extract(v, 0)
-	}
-}
-
+indices :: intrinsics.simd_indices
@@ -47,6 +47,8 @@ foreign user32 {
 		lpParam: LPVOID,
 	) -> HWND ---

+	GetWindowThreadProcessId :: proc(hwnd: HWND, lpdwProcessId: LPDWORD) -> DWORD ---
+
 	DestroyWindow :: proc(hWnd: HWND) -> BOOL ---

 	ShowWindow :: proc(hWnd: HWND, nCmdShow: INT) -> BOOL ---
@@ -760,6 +760,36 @@ gb_internal bool check_builtin_simd_operation(CheckerContext *c, Operand *operan
 			return true;
 		}

+	case BuiltinProc_simd_indices: // type-checks `simd_indices(T)`: its single argument must be a #simd vector type
+		{
+			Operand x = {};
+			check_expr_or_type(c, &x, ce->args[0], nullptr);
+			if (x.mode == Addressing_Invalid) return false;
+			if (x.mode != Addressing_Type) { // the argument is not a type expression
+				gbString s = expr_to_string(x.expr);
+				error(x.expr, "'%.*s' expected a simd vector type, got '%s'", LIT(builtin_name), s);
+				gb_string_free(s);
+				return false;
+			}
+			if (!is_type_simd_vector(x.type)) { // a type was given, but not a simd vector
+				gbString s = type_to_string(x.type);
+				error(x.expr, "'%.*s' expected a simd vector type, got '%s'", LIT(builtin_name), s);
+				gb_string_free(s);
+				return false;
+			}
+
+			Type *elem = base_array_type(x.type); // numeric check is applied to the element type, not the vector type
+			if (!is_type_numeric(elem)) {
+				gbString s = type_to_string(x.type);
+				error(x.expr, "'%.*s' expected a simd vector type with a numeric element type, got '%s'", LIT(builtin_name), s);
+				gb_string_free(s); // NOTE(review): unlike the two checks above, this branch reports the error but does not `return false` - confirm that falling through is intended
+			}
+
+			operand->mode = Addressing_Value; // the call yields a value whose type is the vector type that was passed in
+			operand->type = x.type;
+			return true;
+		}
+
 	case BuiltinProc_simd_extract:
 		{
 			Operand x = {};
@@ -2059,6 +2089,7 @@ gb_internal bool check_builtin_procedure(CheckerContext *c, Operand *operand, As
 	case BuiltinProc_atomic_type_is_lock_free:
 	case BuiltinProc_has_target_feature:
 	case BuiltinProc_procedure_of:
+	case BuiltinProc_simd_indices:
 		// NOTE(bill): The first arg may be a Type, this will be checked case by case
 		break;

@@ -6001,12 +6032,13 @@ gb_internal bool check_builtin_procedure(CheckerContext *c, Operand *operand, As
 			
 			// NOTE(jakubtomsu): forces calculation of variant_block_size
 			type_size_of(u);
-			i64 tag_offset = u->Union.variant_block_size;
-			GB_ASSERT(tag_offset > 0);
+			// NOTE(Jeroen): A tag offset of zero is perfectly fine if all members of the union are empty structs.
+			//               What matters is that the tag size is > 0.
+			GB_ASSERT(u->Union.tag_size > 0);
 			
 			operand->mode = Addressing_Constant;
 			operand->type = t_untyped_integer;
-			operand->value = exact_value_i64(tag_offset);
+			operand->value = exact_value_i64(u->Union.variant_block_size);
 		}
 		break;

@@ -2910,9 +2910,20 @@ gb_internal void check_comparison(CheckerContext *c, Ast *node, Operand *x, Oper
 		if (!defined) {
 			gbString xs = type_to_string(x->type, temporary_allocator());
 			gbString ys = type_to_string(y->type, temporary_allocator());
-			err_str = gb_string_make(temporary_allocator(),
-				gb_bprintf("operator '%.*s' not defined between the types '%s' and '%s'", LIT(token_strings[op]), xs, ys)
-			);
+
+			if (!is_type_comparable(x->type)) {
+				err_str = gb_string_make(temporary_allocator(),
+					gb_bprintf("Type '%s' is not simply comparable, so operator '%.*s' is not defined for it", xs, LIT(token_strings[op]))
+				);
+			} else if (!is_type_comparable(y->type)) {
+				err_str = gb_string_make(temporary_allocator(),
+					gb_bprintf("Type '%s' is not simply comparable, so operator '%.*s' is not defined for it", ys, LIT(token_strings[op]))
+				);
+			} else {
+				err_str = gb_string_make(temporary_allocator(),
+					gb_bprintf("Operator '%.*s' not defined between the types '%s' and '%s'", LIT(token_strings[op]), xs, ys)
+				);
+			}
 		} else {
 			Type *comparison_type = x->type;
 			if (x->type == err_type && is_operand_nil(*x)) {
@@ -2933,11 +2944,11 @@ gb_internal void check_comparison(CheckerContext *c, Ast *node, Operand *x, Oper
 		} else {
 			yt = type_to_string(y->type);
 		}
-		err_str = gb_string_make(temporary_allocator(), gb_bprintf("mismatched types '%s' and '%s'", xt, yt));
+		err_str = gb_string_make(temporary_allocator(), gb_bprintf("Mismatched types '%s' and '%s'", xt, yt));
 	}

 	if (err_str != nullptr) {
-		error(node, "Cannot compare expression, %s", err_str);
+		error(node, "Cannot compare expression. %s.", err_str);
 		x->type = t_untyped_bool;
 	} else {
 		if (x->mode == Addressing_Constant &&
@@ -205,6 +205,9 @@ BuiltinProc__simd_begin,
 	BuiltinProc_simd_masked_expand_load,
 	BuiltinProc_simd_masked_compress_store,

+	BuiltinProc_simd_indices,
+
+
 	// Platform specific SIMD intrinsics
 	BuiltinProc_simd_x86__MM_SHUFFLE,
 BuiltinProc__simd_end,
@@ -551,6 +554,8 @@ gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = {
 	{STR_LIT("simd_masked_expand_load"),    3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	{STR_LIT("simd_masked_compress_store"), 3, false, Expr_Stmt, BuiltinProcPkg_intrinsics},

+	{STR_LIT("simd_indices"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+
 	{STR_LIT("simd_x86__MM_SHUFFLE"), 4, false, Expr_Expr, BuiltinProcPkg_intrinsics},

 	{STR_LIT(""), 0, false, Expr_Stmt, BuiltinProcPkg_intrinsics},
@@ -1293,6 +1293,23 @@ gb_internal lbValue lb_build_builtin_simd_proc(lbProcedure *p, Ast *expr, TypeAn
 	lbValue res = {};
 	res.type = tv.type;

+	switch (builtin_id) { // built-ins handled in this switch return before the arguments are built as expressions below
+	case BuiltinProc_simd_indices: { // presumably handled early because its only argument is a type, not a value - confirm
+		Type *type = base_type(res.type);
+		GB_ASSERT(type->kind == Type_SimdVector);
+		Type *elem = type->SimdVector.elem;
+
+		i64 count = type->SimdVector.count;
+		LLVMValueRef *scalars = gb_alloc_array(temporary_allocator(), LLVMValueRef, count);
+		for (i64 i = 0; i < count; i++) {
+			scalars[i] = lb_const_value(m, elem, exact_value_i64(i)).value; // lane i holds the constant i, built for the element type
+		}
+
+		res.value = LLVMConstVector(scalars, cast(unsigned)count); // the result is a single LLVM constant vector of `count` lanes
+		return res;
+	}
+	}
+
 	lbValue arg0 = {}; if (ce->args.count > 0) arg0 = lb_build_expr(p, ce->args[0]);
 	lbValue arg1 = {}; if (ce->args.count > 1) arg1 = lb_build_expr(p, ce->args[1]);
 	lbValue arg2 = {}; if (ce->args.count > 2) arg2 = lb_build_expr(p, ce->args[2]);
@@ -4108,10 +4108,10 @@ gb_internal i64 type_size_of_internal(Type *t, TypePath *path) {
 		}

 		i64 max = 0;
-		i64 field_size = 0;

 		for_array(i, t->Union.variants) {
 			Type *variant_type = t->Union.variants[i];
+
 			i64 size = type_size_of_internal(variant_type, path);
 			if (max < size) {
 				max = size;
@@ -4130,7 +4130,7 @@ gb_internal i64 type_size_of_internal(Type *t, TypePath *path) {
 			size = align_formula(max, tag_size);
 			// NOTE(bill): Calculate the padding between the common fields and the tag
 			t->Union.tag_size = cast(i16)tag_size;
-			t->Union.variant_block_size = size - field_size;
+			t->Union.variant_block_size = size;

 			size += tag_size;
 		}