Merge branch 'master' into core-simd-indices-redadd-redmul

This commit is contained in:
Barinzaya
2025-05-05 16:37:02 -04:00
12 changed files with 174 additions and 515 deletions
+1 -1
View File
@@ -298,7 +298,7 @@ simd_masked_store :: proc(ptr: rawptr, val: #simd[N]T, mask: #simd[N]U)
simd_masked_expand_load :: proc(ptr: rawptr, val: #simd[N]T, mask: #simd[N]U) -> #simd[N]T where type_is_integer(U) || type_is_boolean(U) ---
simd_masked_compress_store :: proc(ptr: rawptr, val: #simd[N]T, mask: #simd[N]U) where type_is_integer(U) || type_is_boolean(U) ---
// Returns a vector of type `T` in which each lane holds its own lane index.
// The numeric constraint applies to the element type `E`, not to the vector type `T`.
simd_indices :: proc($T: typeid/#simd[$N]$E) -> T where type_is_numeric(E) ---
simd_shuffle :: proc(a, b: #simd[N]T, indices: ..int) -> #simd[len(indices)]T ---
simd_select :: proc(cond: #simd[N]boolean_or_integer, true, false: #simd[N]T) -> #simd[N]T ---
+1 -1
View File
@@ -257,7 +257,7 @@ reader_read_rune :: proc(b: ^Reader) -> (r: rune, size: int, err: io.Error) {
for b.r+utf8.UTF_MAX > b.w &&
!utf8.full_rune(b.buf[b.r:b.w]) &&
b.err == nil &&
b.w-b.w < len(b.buf) {
b.w-b.r < len(b.buf) {
_reader_read_new_chunk(b) or_return
}
+31 -26
View File
@@ -12,7 +12,30 @@ import win32 "core:sys/windows"
INVALID_HANDLE :: ~uintptr(0)
S_IWRITE :: 0o200
// NOTE(Jeroen): We don't translate mode flags for Linux when given to `chmod`.
// Let's not do so for Windows for `chmod` or `read_directory_iterator` either.
// They're *not* portable between Windows and non-Windows platforms.
//
// It also leads to information loss as flags like Archive, Hidden and System have no equivalent there.
// We can of course parse them so we can set the `.Symlink` and `.Directory` type, but we shouldn't pretend
// that 0o644 is meaningful when returned as a mode.
// `C:\bootmgr` as an example has attributes read only, hidden, system, archive. In no way is it sensible to replace that with 0o444.
FILE_ATTRIBUTE_READONLY :: win32.FILE_ATTRIBUTE_READONLY // 0x00000001
FILE_ATTRIBUTE_HIDDEN :: win32.FILE_ATTRIBUTE_HIDDEN // 0x00000002
FILE_ATTRIBUTE_SYSTEM :: win32.FILE_ATTRIBUTE_SYSTEM // 0x00000004
FILE_ATTRIBUTE_DIRECTORY :: win32.FILE_ATTRIBUTE_DIRECTORY // 0x00000010
FILE_ATTRIBUTE_ARCHIVE :: win32.FILE_ATTRIBUTE_ARCHIVE // 0x00000020
FILE_ATTRIBUTE_DEVICE :: win32.FILE_ATTRIBUTE_DEVICE // 0x00000040
FILE_ATTRIBUTE_NORMAL :: win32.FILE_ATTRIBUTE_NORMAL // 0x00000080
FILE_ATTRIBUTE_TEMPORARY :: win32.FILE_ATTRIBUTE_TEMPORARY // 0x00000100
FILE_ATTRIBUTE_SPARSE_FILE :: win32.FILE_ATTRIBUTE_SPARSE_FILE // 0x00000200
FILE_ATTRIBUTE_REPARSE_Point :: win32.FILE_ATTRIBUTE_REPARSE_Point // 0x00000400
FILE_ATTRIBUTE_REPARSE_POINT :: win32.FILE_ATTRIBUTE_REPARSE_POINT // 0x00000400
FILE_ATTRIBUTE_COMPRESSED :: win32.FILE_ATTRIBUTE_COMPRESSED // 0x00000800
FILE_ATTRIBUTE_OFFLINE :: win32.FILE_ATTRIBUTE_OFFLINE // 0x00001000
FILE_ATTRIBUTE_NOT_CONTENT_INDEXED :: win32.FILE_ATTRIBUTE_NOT_CONTENT_INDEXED // 0x00002000
FILE_ATTRIBUTE_ENCRYPTED :: win32.FILE_ATTRIBUTE_ENCRYPTED // 0x00004000
_ERROR_BAD_NETPATH :: 53
MAX_RW :: 1<<30
@@ -122,7 +145,7 @@ _open_internal :: proc(name: string, flags: File_Flags, perm: int) -> (handle: u
}
attrs: u32 = win32.FILE_ATTRIBUTE_NORMAL|win32.FILE_FLAG_BACKUP_SEMANTICS
if perm & S_IWRITE == 0 {
if u32(perm) & FILE_ATTRIBUTE_NORMAL == 0 {
attrs = win32.FILE_ATTRIBUTE_READONLY
if create_mode == win32.CREATE_ALWAYS {
// NOTE(bill): Open has just asked to create a file in read-only mode.
@@ -748,20 +771,10 @@ _fchmod :: proc(f: ^File, mode: int) -> Error {
if f == nil || f.impl == nil {
return nil
}
d: win32.BY_HANDLE_FILE_INFORMATION
if !win32.GetFileInformationByHandle(_handle(f), &d) {
return _get_platform_error()
}
attrs := d.dwFileAttributes
if mode & S_IWRITE != 0 {
attrs &~= win32.FILE_ATTRIBUTE_READONLY
} else {
attrs |= win32.FILE_ATTRIBUTE_READONLY
}
info: win32.FILE_BASIC_INFO
info.FileAttributes = attrs
if !win32.SetFileInformationByHandle(_handle(f), .FileBasicInfo, &info, size_of(d)) {
info.FileAttributes = win32.DWORD(mode)
if !win32.SetFileInformationByHandle(_handle(f), .FileBasicInfo, &info, size_of(info)) {
return _get_platform_error()
}
return nil
@@ -800,19 +813,11 @@ _chtimes :: proc(name: string, atime, mtime: time.Time) -> Error {
defer close(f)
return _fchtimes(f, atime, mtime)
}
_fchtimes :: proc(f: ^File, atime, mtime: time.Time) -> Error {
if f == nil || f.impl == nil {
return nil
}
d: win32.BY_HANDLE_FILE_INFORMATION
if !win32.GetFileInformationByHandle(_handle(f), &d) {
return _get_platform_error()
}
to_windows_time :: #force_inline proc(t: time.Time) -> win32.LARGE_INTEGER {
// a 64-bit value representing the number of 100-nanosecond intervals since January 1, 1601 (UTC)
return win32.LARGE_INTEGER(time.time_to_unix_nano(t) * 100 + 116444736000000000)
}
atime, mtime := atime, mtime
if time.time_to_unix_nano(atime) < time.time_to_unix_nano(mtime) {
@@ -820,9 +825,9 @@ _fchtimes :: proc(f: ^File, atime, mtime: time.Time) -> Error {
}
info: win32.FILE_BASIC_INFO
info.LastAccessTime = to_windows_time(atime)
info.LastWriteTime = to_windows_time(mtime)
if !win32.SetFileInformationByHandle(_handle(f), .FileBasicInfo, &info, size_of(d)) {
info.LastAccessTime = time_as_filetime(atime)
info.LastWriteTime = time_as_filetime(mtime)
if !win32.SetFileInformationByHandle(_handle(f), .FileBasicInfo, &info, size_of(info)) {
return _get_platform_error()
}
return nil
+27 -15
View File
@@ -162,7 +162,7 @@ _process_info_by_pid :: proc(pid: int, selection: Process_Info_Fields, allocator
}
}
cmdline_if: if selection & {.Working_Dir, .Command_Line, .Command_Args, .Executable_Path} != {} {
cmdline_if: if selection & {.Working_Dir, .Command_Line, .Command_Args} != {} {
strings.builder_reset(&path_builder)
strings.write_string(&path_builder, "/proc/")
strings.write_int(&path_builder, pid)
@@ -178,12 +178,12 @@ _process_info_by_pid :: proc(pid: int, selection: Process_Info_Fields, allocator
terminator := strings.index_byte(cmdline, 0)
assert(terminator > 0)
command_line_exec := cmdline[:terminator]
// command_line_exec := cmdline[:terminator]
// Still need cwd if the execution on the command line is relative.
cwd: string
cwd_err: Error
if .Working_Dir in selection || (.Executable_Path in selection && command_line_exec[0] != '/') {
if .Working_Dir in selection {
strings.builder_reset(&path_builder)
strings.write_string(&path_builder, "/proc/")
strings.write_int(&path_builder, pid)
@@ -199,18 +199,6 @@ _process_info_by_pid :: proc(pid: int, selection: Process_Info_Fields, allocator
}
}
if .Executable_Path in selection {
if cmdline[0] == '/' {
info.executable_path = strings.clone(cmdline[:terminator], allocator) or_return
info.fields += {.Executable_Path}
} else if cwd_err == nil {
info.executable_path = join_path({ cwd, cmdline[:terminator] }, allocator) or_return
info.fields += {.Executable_Path}
} else {
break cmdline_if
}
}
if selection & {.Command_Line, .Command_Args} != {} {
// skip to first arg
//cmdline = cmdline[terminator + 1:]
@@ -323,6 +311,30 @@ _process_info_by_pid :: proc(pid: int, selection: Process_Info_Fields, allocator
}
}
if .Executable_Path in selection {
/*
NOTE(Jeroen):
The old version returned the wrong executable path for things like `bash` or `sh`,
for whom `/proc/<pid>/cmdline` will just report "bash" or "sh",
resulting in misleading paths like `$PWD/sh`, even though that executable doesn't exist there.
Thanks to Yawning for suggesting `/proc/self/exe`.
*/
strings.builder_reset(&path_builder)
strings.write_string(&path_builder, "/proc/")
strings.write_int(&path_builder, pid)
strings.write_string(&path_builder, "/exe")
if exe_bytes, exe_err := _read_link(strings.to_string(path_builder), temp_allocator()); exe_err == nil {
info.executable_path = strings.clone(string(exe_bytes), allocator) or_return
info.fields += {.Executable_Path}
} else {
err = exe_err
}
}
if .Environment in selection {
strings.builder_reset(&path_builder)
strings.write_string(&path_builder, "/proc/")
+34 -15
View File
@@ -212,11 +212,15 @@ _file_type_from_create_file :: proc(wname: win32.wstring, create_file_attributes
}
_file_type_mode_from_file_attributes :: proc(file_attributes: win32.DWORD, h: win32.HANDLE, ReparseTag: win32.DWORD) -> (type: File_Type, mode: int) {
if file_attributes & win32.FILE_ATTRIBUTE_READONLY != 0 {
mode |= 0o444
} else {
mode |= 0o666
}
// NOTE(Jeroen): We don't translate mode flags for Linux when given to `chmod`.
// Let's not do so for Windows for `chmod` or `read_directory_iterator` either.
// They're *not* portable between Windows and non-Windows platforms.
//
// It also leads to information loss as flags like Archive, Hidden and System have no equivalent there.
// We can of course parse them so we can set the `.Symlink` and `.Directory` type, but we shouldn't pretend
// that 0o644 is meaningful when returned as a mode.
// `C:\bootmgr` as an example has attributes read only, hidden, system, archive. In no way is it sensible to replace that with 0o444.
mode = int(file_attributes)
is_sym := false
if file_attributes & win32.FILE_ATTRIBUTE_REPARSE_POINT == 0 {
@@ -229,21 +233,36 @@ _file_type_mode_from_file_attributes :: proc(file_attributes: win32.DWORD, h: wi
type = .Symlink
} else if file_attributes & win32.FILE_ATTRIBUTE_DIRECTORY != 0 {
type = .Directory
mode |= 0o111
} else if h != nil {
type = file_type(h)
}
return
}
// Converts `t` into a Windows file time: a 64-bit value counting the number of
// 100-nanosecond intervals since January 1, 1601 (UTC).
time_as_filetime :: #force_inline proc(t: time.Time) -> (ft: win32.LARGE_INTEGER) {
	// Offset added after scaling `t._nsec` down to 100 ns ticks.
	// NOTE(review): presumably the tick count between 1601-01-01 and the Unix epoch - confirm.
	EPOCH_OFFSET_TICKS :: 116444736000000000

	ticks := u64(t._nsec / 100)
	ticks += EPOCH_OFFSET_TICKS
	return win32.LARGE_INTEGER(ticks)
}
// Converts a Windows file time held in a `LARGE_INTEGER` (100-nanosecond ticks)
// into a `time.Time` by removing the epoch offset and scaling ticks to nanoseconds.
filetime_as_time_li :: #force_inline proc(ft: win32.LARGE_INTEGER) -> (t: time.Time) {
	ticks := i64(ft) - 116444736000000000
	t._nsec = ticks * 100
	return
}
// Converts a `win32.FILETIME` (two 32-bit halves) into a `time.Time` by joining the
// halves into one 64-bit tick count and delegating to `filetime_as_time_li`.
filetime_as_time_ft :: #force_inline proc(ft: win32.FILETIME) -> (t: time.Time) {
	low_part  := win32.LARGE_INTEGER(ft.dwLowDateTime)
	high_part := win32.LARGE_INTEGER(ft.dwHighDateTime)
	return filetime_as_time_li(low_part + (high_part << 32))
}
filetime_as_time :: proc{filetime_as_time_ft, filetime_as_time_li}
_file_info_from_win32_file_attribute_data :: proc(d: ^win32.WIN32_FILE_ATTRIBUTE_DATA, name: string, allocator: runtime.Allocator) -> (fi: File_Info, e: Error) {
fi.size = i64(d.nFileSizeHigh)<<32 + i64(d.nFileSizeLow)
type, mode := _file_type_mode_from_file_attributes(d.dwFileAttributes, nil, 0)
fi.type = type
fi.mode |= mode
fi.creation_time = time.unix(0, win32.FILETIME_as_unix_nanoseconds(d.ftCreationTime))
fi.modification_time = time.unix(0, win32.FILETIME_as_unix_nanoseconds(d.ftLastWriteTime))
fi.access_time = time.unix(0, win32.FILETIME_as_unix_nanoseconds(d.ftLastAccessTime))
fi.creation_time = filetime_as_time(d.ftCreationTime)
fi.modification_time = filetime_as_time(d.ftLastWriteTime)
fi.access_time = filetime_as_time(d.ftLastAccessTime)
fi.fullpath, e = full_path_from_name(name, allocator)
fi.name = basename(fi.fullpath)
return
@@ -254,9 +273,9 @@ _file_info_from_win32_find_data :: proc(d: ^win32.WIN32_FIND_DATAW, name: string
type, mode := _file_type_mode_from_file_attributes(d.dwFileAttributes, nil, 0)
fi.type = type
fi.mode |= mode
fi.creation_time = time.unix(0, win32.FILETIME_as_unix_nanoseconds(d.ftCreationTime))
fi.modification_time = time.unix(0, win32.FILETIME_as_unix_nanoseconds(d.ftLastWriteTime))
fi.access_time = time.unix(0, win32.FILETIME_as_unix_nanoseconds(d.ftLastAccessTime))
fi.creation_time = filetime_as_time(d.ftCreationTime)
fi.modification_time = filetime_as_time(d.ftLastWriteTime)
fi.access_time = filetime_as_time(d.ftLastAccessTime)
fi.fullpath, e = full_path_from_name(name, allocator)
fi.name = basename(fi.fullpath)
return
@@ -286,9 +305,9 @@ _file_info_from_get_file_information_by_handle :: proc(path: string, h: win32.HA
type, mode := _file_type_mode_from_file_attributes(d.dwFileAttributes, h, 0)
fi.type = type
fi.mode |= mode
fi.creation_time = time.unix(0, win32.FILETIME_as_unix_nanoseconds(d.ftCreationTime))
fi.modification_time = time.unix(0, win32.FILETIME_as_unix_nanoseconds(d.ftLastWriteTime))
fi.access_time = time.unix(0, win32.FILETIME_as_unix_nanoseconds(d.ftLastAccessTime))
fi.creation_time = filetime_as_time(d.ftCreationTime)
fi.modification_time = filetime_as_time(d.ftLastWriteTime)
fi.access_time = filetime_as_time(d.ftLastAccessTime)
return fi, nil
}
+3 -447
View File
@@ -1759,7 +1759,7 @@ Returns:
replace :: intrinsics.simd_replace
/*
Reduce a vector to a scalar by adding up all the lanes in an ordered fashion.
Reduce a vector to a scalar by adding up all the lanes.
This procedure returns a scalar that is the ordered sum of all lanes. The
ordered sum may be important for accounting for precision errors in
@@ -2511,460 +2511,16 @@ recip :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where int
return T(1) / v
}
/*
Create a vector where each lane contains the index of that lane.
Inputs:
- `V`: The type of the vector to create.
Result:
- A vector of the given type, where each lane contains the index of that lane.
**Operation**:
for i in 0 ..< N {
res[i] = i
}
*/
indices :: #force_inline proc "contextless" ($V: typeid/#simd[$N]$E) -> V where intrinsics.type_is_numeric(E) {
// `N` is a compile-time parameter, so exactly one branch of this `when` chain is kept.
// Each supported lane count returns a literal listing the lane indices 0 ..< N.
when N == 1 {
return {0}
} else when N == 2 {
return {0, 1}
} else when N == 4 {
return {0, 1, 2, 3}
} else when N == 8 {
return {0, 1, 2, 3, 4, 5, 6, 7}
} else when N == 16 {
return {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
} else when N == 32 {
return {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
}
} else when N == 64 {
return {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
}
} else {
// Any lane count without a literal above is rejected via `#panic`.
#panic("Unsupported vector size!")
}
}
/*
Reduce a vector to a scalar by adding up all the lanes in a pairwise fashion.
This procedure returns a scalar that is the sum of all lanes, calculated by
adding each even-indexed element with the following odd-indexed element to
produce N/2 values. This is repeated until only a single element remains. This
order is supported by hardware instructions for some types/architectures (e.g.
i16/i32/f32/f64 on x86 SSE, i8/i16/i32/f32 on ARM NEON).
The order of the sum may be important for accounting for precision errors in
floating-point computation, as floating-point addition is not associative, that
is `(a+b)+c` may not be equal to `a+(b+c)`.
Inputs:
- `v`: The vector to reduce.
Result:
- Sum of all lanes, as a scalar.
**Operation**:
for n > 1 {
n = n / 2
for i in 0 ..< n {
a[i] = a[2*i+0] + a[2*i+1]
}
}
res := a[0]
Graphical representation of the operation for N=4:
+-----------------------+
v: | v0 | v1 | v2 | v3 |
+-----------------------+
| | | |
`>[+]<' `>[+]<'
| |
`--->[+]<--'
|
v
+-----+
result: | y0 |
+-----+
*/
reduce_add_pairs :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
where intrinsics.type_is_numeric(E) {
// Seed the working vector for the starting width; at most one of these branches
// is active for a given N.
// NOTE(review): later stages read v64/v32/... declared inside these `when` bodies,
// so this relies on `when` not opening a new scope - confirm.
when N == 64 { v64 := v }
when N == 32 { v32 := v }
when N == 16 { v16 := v }
when N == 8 { v8 := v }
when N == 4 { v4 := v }
when N == 2 { v2 := v }
// Each stage below gathers the even lanes (x) and the odd lanes (y) of the
// current vector and adds them, halving the lane count.
when N >= 64 {
x32 := swizzle(v64,
0, 2, 4, 6, 8, 10, 12, 14,
16, 18, 20, 22, 24, 26, 28, 30,
32, 34, 36, 38, 40, 42, 44, 46,
48, 50, 52, 54, 56, 58, 60, 62)
y32 := swizzle(v64,
1, 3, 5, 7, 9, 11, 13, 15,
17, 19, 21, 23, 25, 27, 29, 31,
33, 35, 37, 39, 41, 43, 45, 47,
49, 51, 53, 55, 57, 59, 61, 63)
v32 := x32 + y32
}
when N >= 32 {
x16 := swizzle(v32,
0, 2, 4, 6, 8, 10, 12, 14,
16, 18, 20, 22, 24, 26, 28, 30)
y16 := swizzle(v32,
1, 3, 5, 7, 9, 11, 13, 15,
17, 19, 21, 23, 25, 27, 29, 31)
v16 := x16 + y16
}
when N >= 16 {
x8 := swizzle(v16, 0, 2, 4, 6, 8, 10, 12, 14)
y8 := swizzle(v16, 1, 3, 5, 7, 9, 11, 13, 15)
v8 := x8 + y8
}
when N >= 8 {
x4 := swizzle(v8, 0, 2, 4, 6)
y4 := swizzle(v8, 1, 3, 5, 7)
v4 := x4 + y4
}
when N >= 4 {
x2 := swizzle(v4, 0, 2)
y2 := swizzle(v4, 1, 3)
v2 := x2 + y2
}
// Final step: add the two remaining lanes; a single-lane vector is returned as is.
when N >= 2 {
return extract(v2, 0) + extract(v2, 1)
} else {
return extract(v, 0)
}
}
/*
Reduce a vector to a scalar by adding up all the lanes in a bisecting fashion.
This procedure returns a scalar that is the sum of all lanes, calculated by
bisecting the vector into two parts, where the first contains lanes [0, N/2)
and the second contains lanes [N/2, N), and adding the two halves element-wise
to produce N/2 values. This is repeated until only a single element remains.
This order may be faster to compute than the ordered sum for floats, as it can
often be better parallelized.
The order of the sum may be important for accounting for precision errors in
floating-point computation, as floating-point addition is not associative, that
is `(a+b)+c` may not be equal to `a+(b+c)`.
Inputs:
- `v`: The vector to reduce.
Result:
- Sum of all lanes, as a scalar.
**Operation**:
for n > 1 {
n = n / 2
for i in 0 ..< n {
a[i] += a[i+n]
}
}
res := a[0]
Graphical representation of the operation for N=4:
+-----------------------+
| v0 | v1 | v2 | v3 |
+-----------------------+
| | | |
[+]<-- | ---' |
| [+]<--------'
| |
`>[+]<'
|
v
+-----+
result: | y0 |
+-----+
*/
reduce_add_bisect :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
where intrinsics.type_is_numeric(E) {
// Seed the working vector for the starting width; at most one of these branches
// is active for a given N.
// NOTE(review): later stages read v64/v32/... declared inside these `when` bodies,
// so this relies on `when` not opening a new scope - confirm.
when N == 64 { v64 := v }
when N == 32 { v32 := v }
when N == 16 { v16 := v }
when N == 8 { v8 := v }
when N == 4 { v4 := v }
when N == 2 { v2 := v }
// Each stage below takes the lower half (x) and the upper half (y) of the
// current vector and adds them lane by lane, halving the lane count.
when N >= 64 {
x32 := swizzle(v64,
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31)
y32 := swizzle(v64,
32, 33, 34, 35, 36, 37, 38, 39,
40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55,
56, 57, 58, 59, 60, 61, 62, 63)
v32 := x32 + y32
}
when N >= 32 {
x16 := swizzle(v32,
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15)
y16 := swizzle(v32,
16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31)
v16 := x16 + y16
}
when N >= 16 {
x8 := swizzle(v16, 0, 1, 2, 3, 4, 5, 6, 7)
y8 := swizzle(v16, 8, 9, 10, 11, 12, 13, 14, 15)
v8 := x8 + y8
}
when N >= 8 {
x4 := swizzle(v8, 0, 1, 2, 3)
y4 := swizzle(v8, 4, 5, 6, 7)
v4 := x4 + y4
}
when N >= 4 {
x2 := swizzle(v4, 0, 1)
y2 := swizzle(v4, 2, 3)
v2 := x2 + y2
}
// Final step: add the two remaining lanes; a single-lane vector is returned as is.
when N >= 2 {
return extract(v2, 0) + extract(v2, 1)
} else {
return extract(v, 0)
}
}
/*
Reduce a vector to a scalar by multiplying all the lanes in a pairwise fashion.
This procedure returns a scalar that is the product of all lanes, calculated by
multiplying each even-indexed element with the following odd-indexed element to
produce N/2 values. This is repeated until only a single element remains. This
order may be faster to compute than the ordered product for floats, as it can
often be better parallelized.
The order of the product may be important for accounting for precision errors
in floating-point computation, as floating-point multiplication is not
associative, that is `(a*b)*c` may not be equal to `a*(b*c)`.
Inputs:
- `v`: The vector to reduce.
Result:
- Product of all lanes, as a scalar.
**Operation**:
for n > 1 {
n = n / 2
for i in 0 ..< n {
a[i] = a[2*i+0] * a[2*i+1]
}
}
res := a[0]
Graphical representation of the operation for N=4:
+-----------------------+
v: | v0 | v1 | v2 | v3 |
+-----------------------+
| | | |
`>[x]<' `>[x]<'
| |
`--->[x]<--'
|
v
+-----+
result: | y0 |
+-----+
*/
reduce_mul_pairs :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
where intrinsics.type_is_numeric(E) {
// Seed the working vector for the starting width; at most one of these branches
// is active for a given N.
// NOTE(review): later stages read v64/v32/... declared inside these `when` bodies,
// so this relies on `when` not opening a new scope - confirm.
when N == 64 { v64 := v }
when N == 32 { v32 := v }
when N == 16 { v16 := v }
when N == 8 { v8 := v }
when N == 4 { v4 := v }
when N == 2 { v2 := v }
// Each stage below gathers the even lanes (x) and the odd lanes (y) of the
// current vector and multiplies them, halving the lane count.
when N >= 64 {
x32 := swizzle(v64,
0, 2, 4, 6, 8, 10, 12, 14,
16, 18, 20, 22, 24, 26, 28, 30,
32, 34, 36, 38, 40, 42, 44, 46,
48, 50, 52, 54, 56, 58, 60, 62)
y32 := swizzle(v64,
1, 3, 5, 7, 9, 11, 13, 15,
17, 19, 21, 23, 25, 27, 29, 31,
33, 35, 37, 39, 41, 43, 45, 47,
49, 51, 53, 55, 57, 59, 61, 63)
v32 := x32 * y32
}
when N >= 32 {
x16 := swizzle(v32,
0, 2, 4, 6, 8, 10, 12, 14,
16, 18, 20, 22, 24, 26, 28, 30)
y16 := swizzle(v32,
1, 3, 5, 7, 9, 11, 13, 15,
17, 19, 21, 23, 25, 27, 29, 31)
v16 := x16 * y16
}
when N >= 16 {
x8 := swizzle(v16, 0, 2, 4, 6, 8, 10, 12, 14)
y8 := swizzle(v16, 1, 3, 5, 7, 9, 11, 13, 15)
v8 := x8 * y8
}
when N >= 8 {
x4 := swizzle(v8, 0, 2, 4, 6)
y4 := swizzle(v8, 1, 3, 5, 7)
v4 := x4 * y4
}
when N >= 4 {
x2 := swizzle(v4, 0, 2)
y2 := swizzle(v4, 1, 3)
v2 := x2 * y2
}
// Final step: multiply the two remaining lanes; a single-lane vector is returned as is.
when N >= 2 {
return extract(v2, 0) * extract(v2, 1)
} else {
return extract(v, 0)
}
}
/*
Reduce a vector to a scalar by multiplying all the lanes in a bisecting fashion.
This procedure returns a scalar that is the product of all lanes, calculated by
bisecting the vector into two parts, where the first contains indices [0, N/2)
and the second contains indices [N/2, N), and multiplying the two halves
together element-wise to produce N/2 values. This is repeated until only a
single element remains. This order may be faster to compute than the ordered
product for floats, as it can often be better parallelized.
The order of the product may be important for accounting for precision errors
in floating-point computation, as floating-point multiplication is not
associative, that is `(a*b)*c` may not be equal to `a*(b*c)`.
Inputs:
- `v`: The vector to reduce.
Result:
- Product of all lanes, as a scalar.
**Operation**:
for n > 1 {
n = n / 2
for i in 0 ..< n {
a[i] *= a[i+n]
}
}
res := a[0]
Graphical representation of the operation for N=4:
+-----------------------+
| v0 | v1 | v2 | v3 |
+-----------------------+
| | | |
[x]<-- | ---' |
| [x]<--------'
| |
`>[x]<'
|
v
+-----+
result: | y0 |
+-----+
*/
reduce_mul_bisect :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
where intrinsics.type_is_numeric(E) {
// Seed the working vector for the starting width; at most one of these branches
// is active for a given N.
// NOTE(review): later stages read v64/v32/... declared inside these `when` bodies,
// so this relies on `when` not opening a new scope - confirm.
when N == 64 { v64 := v }
when N == 32 { v32 := v }
when N == 16 { v16 := v }
when N == 8 { v8 := v }
when N == 4 { v4 := v }
when N == 2 { v2 := v }
// Each stage below takes the lower half (x) and the upper half (y) of the
// current vector and multiplies them lane by lane, halving the lane count.
when N >= 64 {
x32 := swizzle(v64,
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31)
y32 := swizzle(v64,
32, 33, 34, 35, 36, 37, 38, 39,
40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55,
56, 57, 58, 59, 60, 61, 62, 63)
v32 := x32 * y32
}
when N >= 32 {
x16 := swizzle(v32,
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15)
y16 := swizzle(v32,
16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31)
v16 := x16 * y16
}
when N >= 16 {
x8 := swizzle(v16, 0, 1, 2, 3, 4, 5, 6, 7)
y8 := swizzle(v16, 8, 9, 10, 11, 12, 13, 14, 15)
v8 := x8 * y8
}
when N >= 8 {
x4 := swizzle(v8, 0, 1, 2, 3)
y4 := swizzle(v8, 4, 5, 6, 7)
v4 := x4 * y4
}
when N >= 4 {
x2 := swizzle(v4, 0, 1)
y2 := swizzle(v4, 2, 3)
v2 := x2 * y2
}
// Final step: multiply the two remaining lanes; a single-lane vector is returned as is.
when N >= 2 {
return extract(v2, 0) * extract(v2, 1)
} else {
return extract(v, 0)
}
}
indices :: intrinsics.simd_indices
+2
View File
@@ -47,6 +47,8 @@ foreign user32 {
lpParam: LPVOID,
) -> HWND ---
GetWindowThreadProcessId :: proc(hwnd: HWND, lpdwProcessId: LPDWORD) -> DWORD ---
DestroyWindow :: proc(hWnd: HWND) -> BOOL ---
ShowWindow :: proc(hWnd: HWND, nCmdShow: INT) -> BOOL ---
+35 -3
View File
@@ -760,6 +760,36 @@ gb_internal bool check_builtin_simd_operation(CheckerContext *c, Operand *operan
return true;
}
case BuiltinProc_simd_indices:
	{
		// simd_indices(T): the single argument must be a type, that type must be a
		// simd vector, and its element type must be numeric. On success the call
		// is a value of the vector type itself.
		Operand x = {};
		check_expr_or_type(c, &x, ce->args[0], nullptr);
		if (x.mode == Addressing_Invalid) return false;
		if (x.mode != Addressing_Type) {
			gbString s = expr_to_string(x.expr);
			error(x.expr, "'%.*s' expected a simd vector type, got '%s'", LIT(builtin_name), s);
			gb_string_free(s);
			return false;
		}
		if (!is_type_simd_vector(x.type)) {
			gbString s = type_to_string(x.type);
			error(x.expr, "'%.*s' expected a simd vector type, got '%s'", LIT(builtin_name), s);
			gb_string_free(s);
			return false;
		}
		Type *elem = base_array_type(x.type);
		if (!is_type_numeric(elem)) {
			gbString s = type_to_string(x.type);
			error(x.expr, "'%.*s' expected a simd vector type with a numeric element type, got '%s'", LIT(builtin_name), s);
			gb_string_free(s);
			// Fail like the two checks above; previously this path reported the
			// error and then fell through to `return true` with a valid operand.
			return false;
		}
		operand->mode = Addressing_Value;
		operand->type = x.type;
		return true;
	}
case BuiltinProc_simd_extract:
{
Operand x = {};
@@ -2059,6 +2089,7 @@ gb_internal bool check_builtin_procedure(CheckerContext *c, Operand *operand, As
case BuiltinProc_atomic_type_is_lock_free:
case BuiltinProc_has_target_feature:
case BuiltinProc_procedure_of:
case BuiltinProc_simd_indices:
// NOTE(bill): The first arg may be a Type, this will be checked case by case
break;
@@ -6001,12 +6032,13 @@ gb_internal bool check_builtin_procedure(CheckerContext *c, Operand *operand, As
// NOTE(jakubtomsu): forces calculation of variant_block_size
type_size_of(u);
i64 tag_offset = u->Union.variant_block_size;
GB_ASSERT(tag_offset > 0);
// NOTE(Jeroen): A tag offset of zero is perfectly fine if all members of the union are empty structs.
// What matters is that the tag size is > 0.
GB_ASSERT(u->Union.tag_size > 0);
operand->mode = Addressing_Constant;
operand->type = t_untyped_integer;
operand->value = exact_value_i64(tag_offset);
operand->value = exact_value_i64(u->Union.variant_block_size);
}
break;
+16 -5
View File
@@ -2910,9 +2910,20 @@ gb_internal void check_comparison(CheckerContext *c, Ast *node, Operand *x, Oper
if (!defined) {
gbString xs = type_to_string(x->type, temporary_allocator());
gbString ys = type_to_string(y->type, temporary_allocator());
err_str = gb_string_make(temporary_allocator(),
gb_bprintf("operator '%.*s' not defined between the types '%s' and '%s'", LIT(token_strings[op]), xs, ys)
);
if (!is_type_comparable(x->type)) {
err_str = gb_string_make(temporary_allocator(),
gb_bprintf("Type '%s' is not simply comparable, so operator '%.*s' is not defined for it", xs, LIT(token_strings[op]))
);
} else if (!is_type_comparable(y->type)) {
err_str = gb_string_make(temporary_allocator(),
gb_bprintf("Type '%s' is not simply comparable, so operator '%.*s' is not defined for it", ys, LIT(token_strings[op]))
);
} else {
err_str = gb_string_make(temporary_allocator(),
gb_bprintf("Operator '%.*s' not defined between the types '%s' and '%s'", LIT(token_strings[op]), xs, ys)
);
}
} else {
Type *comparison_type = x->type;
if (x->type == err_type && is_operand_nil(*x)) {
@@ -2933,11 +2944,11 @@ gb_internal void check_comparison(CheckerContext *c, Ast *node, Operand *x, Oper
} else {
yt = type_to_string(y->type);
}
err_str = gb_string_make(temporary_allocator(), gb_bprintf("mismatched types '%s' and '%s'", xt, yt));
err_str = gb_string_make(temporary_allocator(), gb_bprintf("Mismatched types '%s' and '%s'", xt, yt));
}
if (err_str != nullptr) {
error(node, "Cannot compare expression, %s", err_str);
error(node, "Cannot compare expression. %s.", err_str);
x->type = t_untyped_bool;
} else {
if (x->mode == Addressing_Constant &&
+5
View File
@@ -205,6 +205,9 @@ BuiltinProc__simd_begin,
BuiltinProc_simd_masked_expand_load,
BuiltinProc_simd_masked_compress_store,
BuiltinProc_simd_indices,
// Platform specific SIMD intrinsics
BuiltinProc_simd_x86__MM_SHUFFLE,
BuiltinProc__simd_end,
@@ -551,6 +554,8 @@ gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = {
{STR_LIT("simd_masked_expand_load"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
{STR_LIT("simd_masked_compress_store"), 3, false, Expr_Stmt, BuiltinProcPkg_intrinsics},
{STR_LIT("simd_indices"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
{STR_LIT("simd_x86__MM_SHUFFLE"), 4, false, Expr_Expr, BuiltinProcPkg_intrinsics},
{STR_LIT(""), 0, false, Expr_Stmt, BuiltinProcPkg_intrinsics},
+17
View File
@@ -1293,6 +1293,23 @@ gb_internal lbValue lb_build_builtin_simd_proc(lbProcedure *p, Ast *expr, TypeAn
lbValue res = {};
res.type = tv.type;
switch (builtin_id) {
case BuiltinProc_simd_indices: {
	// Handled before any argument is built: the result is a constant vector
	// in which lane `i` holds the constant `i`, typed as the vector's element type.
	Type *vector_type = base_type(res.type);
	GB_ASSERT(vector_type->kind == Type_SimdVector);

	i64   lane_count = vector_type->SimdVector.count;
	Type *lane_type  = vector_type->SimdVector.elem;

	LLVMValueRef *lanes = gb_alloc_array(temporary_allocator(), LLVMValueRef, lane_count);
	i64 lane = 0;
	while (lane < lane_count) {
		lbValue lane_const = lb_const_value(m, lane_type, exact_value_i64(lane));
		lanes[lane] = lane_const.value;
		lane += 1;
	}

	res.value = LLVMConstVector(lanes, cast(unsigned)lane_count);
	return res;
}
}
lbValue arg0 = {}; if (ce->args.count > 0) arg0 = lb_build_expr(p, ce->args[0]);
lbValue arg1 = {}; if (ce->args.count > 1) arg1 = lb_build_expr(p, ce->args[1]);
lbValue arg2 = {}; if (ce->args.count > 2) arg2 = lb_build_expr(p, ce->args[2]);
+2 -2
View File
@@ -4108,10 +4108,10 @@ gb_internal i64 type_size_of_internal(Type *t, TypePath *path) {
}
i64 max = 0;
i64 field_size = 0;
for_array(i, t->Union.variants) {
Type *variant_type = t->Union.variants[i];
i64 size = type_size_of_internal(variant_type, path);
if (max < size) {
max = size;
@@ -4130,7 +4130,7 @@ gb_internal i64 type_size_of_internal(Type *t, TypePath *path) {
size = align_formula(max, tag_size);
// NOTE(bill): Calculate the padding between the common fields and the tag
t->Union.tag_size = cast(i16)tag_size;
t->Union.variant_block_size = size - field_size;
t->Union.variant_block_size = size;
size += tag_size;
}