/*
The SIMD support package.

SIMD (Single Instruction Multiple Data), is a CPU hardware feature that
introduce special registers and instructions which operate on multiple units
of data at the same time, , which enables faster data processing for
applications with heavy computational workloads.

In Odin SIMD is exposed via a special kinds of arrays, called the *SIMD
vectors*. The types of SIMD vectors is written as `#simd [N]T`, where N is a
power of two, and T could be any basic type (integers, floats, etc.). The
documentation of this package will call *SIMD vectors* just *vectors*.

The elements of SIMD vectors consist of elements, called *scalar values*, or
*scalars*, each occupying a *lane* of the SIMD vector.

This package implements procedures for working with vectors.
*/
package simd

import "base:builtin"
import "base:intrinsics"

/*
Check if SIMD is emulated on a target platform.

This value is `true`, if the compile-time target has the hardware support for
at 128-bit (or wider) SIMD. If the compile-time target lacks the hardware support
for 128-bit SIMD, this value is `false`, and all SIMD operations will be
emulated.
*/
IS_EMULATED :: true when (ODIN_ARCH == .amd64 || ODIN_ARCH == .i386) && !intrinsics.has_target_feature("sse2") else
	true when (ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32) && !intrinsics.has_target_feature("neon") else
	true when (ODIN_ARCH == .wasm64p32 || ODIN_ARCH == .wasm32) && !intrinsics.has_target_feature("simd128") else
	true when (ODIN_ARCH == .riscv64) && !intrinsics.has_target_feature("v") else
	false

/*
Vector of 16 `u8` lanes (128 bits).
*/
u8x16 :: #simd[16]u8

/*
Vector of 16 `i8` lanes (128 bits).
*/
i8x16 :: #simd[16]i8

/*
Vector of 8 `u16` lanes (128 bits).
*/
u16x8 :: #simd[8]u16

/*
Vector of 8 `i16` lanes (128 bits).
*/
i16x8 :: #simd[8]i16

/*
Vector of 4 `u32` lanes (128 bits).
*/
u32x4 :: #simd[4]u32

/*
Vector of 4 `i32` lanes (128 bits).
*/
i32x4 :: #simd[4]i32

/*
Vector of 2 `u64` lanes (128 bits).
*/
u64x2 :: #simd[2]u64

/*
Vector of 2 `i64` lanes (128 bits).
*/
i64x2 :: #simd[2]i64

/*
Vector of 4 `f32` lanes (128 bits).
*/
f32x4 :: #simd[4]f32

/*
Vector of 2 `f64` lanes (128 bits).
*/
f64x2 :: #simd[2]f64

/*
Vector of 16 `bool` lanes (128 bits).
*/
boolx16 :: #simd[16]bool

/*
Vector of 16 `b8` lanes (128 bits).
*/
b8x16   :: #simd[16]b8

/*
Vector of 8 `b16` lanes (128 bits).
*/
b16x8   :: #simd[8]b16

/*
Vector of 4 `b32` lanes (128 bits).
*/
b32x4   :: #simd[4]b32

/*
Vector of 2 `b64` lanes (128 bits).
*/
b64x2   :: #simd[2]b64

/*
Vector of 32 `u8` lanes (256 bits).
*/
u8x32  :: #simd[32]u8

/*
Vector of 32 `i8` lanes (256 bits).
*/
i8x32  :: #simd[32]i8

/*
Vector of 16 `u16` lanes (256 bits).
*/
u16x16 :: #simd[16]u16

/*
Vector of 16 `i16` lanes (256 bits).
*/
i16x16 :: #simd[16]i16

/*
Vector of 8 `u32` lanes (256 bits).
*/
u32x8  :: #simd[8]u32

/*
Vector of 8 `i32` lanes (256 bits).
*/
i32x8  :: #simd[8]i32

/*
Vector of 4 `u64` lanes (256 bits).
*/
u64x4  :: #simd[4]u64

/*
Vector of 4 `i64` lanes (256 bits).
*/
i64x4  :: #simd[4]i64

/*
Vector of 8 `f32` lanes (256 bits).
*/
f32x8  :: #simd[8]f32

/*
Vector of 4 `f64` lanes (256 bits).
*/
f64x4  :: #simd[4]f64

/*
Vector of 32 `bool` lanes (256 bits).
*/
boolx32 :: #simd[32]bool

/*
Vector of 32 `b8` lanes (256 bits).
*/
b8x32   :: #simd[32]b8

/*
Vector of 16 `b16` lanes (256 bits).
*/
b16x16  :: #simd[16]b16

/*
Vector of 8 `b32` lanes (256 bits).
*/
b32x8   :: #simd[8]b32

/*
Vector of 4 `b64` lanes (256 bits).
*/
b64x4   :: #simd[4]b64

/*
Vector of 64 `u8` lanes (512 bits).
*/
u8x64  :: #simd[64]u8

/*
Vector of 64 `i8` lanes (512 bits).
*/
i8x64  :: #simd[64]i8

/*
Vector of 32 `u16` lanes (512 bits).
*/
u16x32 :: #simd[32]u16

/*
Vector of 32 `i16` lanes (512 bits).
*/
i16x32 :: #simd[32]i16

/*
Vector of 16 `u32` lanes (512 bits).
*/
u32x16 :: #simd[16]u32

/*
Vector of 16 `i32` lanes (512 bits).
*/
i32x16 :: #simd[16]i32

/*
Vector of 8 `u64` lanes (512 bits).
*/
u64x8  :: #simd[8]u64

/*
Vector of 8 `i64` lanes (512 bits).
*/
i64x8  :: #simd[8]i64

/*
Vector of 16 `f32` lanes (512 bits).
*/
f32x16 :: #simd[16]f32

/*
Vector of 8 `f64` lanes (512 bits).
*/
f64x8  :: #simd[8]f64

/*
Vector of 64 `bool` lanes (512 bits).
*/
boolx64 :: #simd[64]bool

/*
Vector of 64 `b8` lanes (512 bits).
*/
b8x64   :: #simd[64]b8

/*
Vector of 32 `b16` lanes (512 bits).
*/
b16x32  :: #simd[32]b16

/*
Vector of 16 `b32` lanes (512 bits).
*/
b32x16  :: #simd[16]b32

/*
Vector of 8 `b64` lanes (512 bits).
*/
b64x8   :: #simd[8]b64

/*
Add SIMD vectors.

This procedure returns a vector, where each lane holds the sum of the
corresponding `a` and `b` vectors' lanes.

Inputs:
- `a`: An integer or a float vector.
- `b`: An integer or a float vector.

Returns:
- The sum of two vectors.

**Operation**:
	
	for i in 0 ..< len(res) {
		res[i] = a[i] + b[i]
	}
	return res

Example:

	   +-----+-----+-----+-----+
	a: |  0  |  1  |  2  |  3  |
	   +-----+-----+-----+-----+
	   +-----+-----+-----+-----+
	b: |  0  |  1  |  2  | -1  |
	   +-----+-----+-----+-----+
	res:
	   +-----+-----+-----+-----+
	   |  0  |  2  |  4  |  2  |
	   +-----+-----+-----+-----+
*/
add :: intrinsics.simd_add

/*
Subtract SIMD vectors.

This procedure returns a vector, where each lane holds the difference between
the corresponding lanes of the vectors `a` and `b`. The lanes from the vector
`b` are subtracted from the corresponding lanes of the vector `a`.

Inputs:
- `a`: Integer or a float vector to subtract from.
- `b`: Integer or a float vector.

Returns:
- The difference of two vectors.

**Operation**:

	for i in 0 ..< len(res) {
		res[i] = a[i] - b[i]
	}
	return res

Example:

	   +-----+-----+-----+-----+
	a: |  2  |  2  |  2  |  2  |
	   +-----+-----+-----+-----+
	   +-----+-----+-----+-----+
	b: |  0  |  1  |  2  |  3  |
	   +-----+-----+-----+-----+
	res:
	   +-----+-----+-----+-----+
	   |  2  |  1  |  0  | -1  |
	   +-----+-----+-----+-----+
*/
sub :: intrinsics.simd_sub

/*
Multiply (component-wise) SIMD vectors.

This procedure returns a vector, where each lane holds the product of the
corresponding lanes of the vectors `a` and `b`.

Inputs:
- `a`: Integer or a float vector.
- `b`: Integer or a float vector.

Returns:
- The product of two vectors.

**Operation**:

	for i in 0 ..< len(res) {
		res[i] = a[i] * b[i]
	}
	return res

Example:

	   +-----+-----+-----+-----+
	a: |  2  |  2  |  2  |  2  |
	   +-----+-----+-----+-----+
	   +-----+-----+-----+-----+
	b: |  0  | -1  |  2  | -3  |
	   +-----+-----+-----+-----+
	res:
	   +-----+-----+-----+-----+
	   |  0  | -2  |  4  | -6  |
	   +-----+-----+-----+-----+
*/
mul :: intrinsics.simd_mul

/*
Divide SIMD vectors.

This procedure returns a vector, where each lane holds the quotient (result
of division) between the corresponding lanes of the vectors `a` and `b`. Each
lane of the vector `a` is divided by the corresponding lane of the vector `b`.

This operation performs a standard floating-point division for each lane.

Inputs:
- `a`: Float vector.
- `b`: Float vector to divide by.

Returns:
- The quotient of two vectors.

**Operation**:

	for i in 0 ..< len(res) {
		res[i] = a[i] / b[i]
	}
	return res

Example:

	   +-----+-----+-----+-----+
	a: |  2  |  2  |  2  |  2  |
	   +-----+-----+-----+-----+
	   +-----+-----+-----+-----+
	b: |  0  | -1  |  2  | -3  |
	   +-----+-----+-----+-----+
	res:
	   +-----+-----+-----+-------+
	   | +∞  | -2  |  1  | -0.66 |
	   +-----+-----+-----+-------+
*/
div :: intrinsics.simd_div

/*
Shift left lanes of a vector.

This procedure returns a vector, such that each lane holds the result of a
shift-left (aka shift-up) operation, of lane from the vector `a` by the shift
amount from the corresponding lane of the vector `b`.

If the shift amount is greater than the bit-width of a lane, the result is `0`
in the corresponding positions of the result.

Inputs:
- `a`: An integer vector of values to shift.
- `b`: An unsigned integer vector of the shift amounts.

Result:
- Shifted vector.

**Operation**:

	for i in 0 ..< len(res) {
		if b[i] < 8*size_of(a[i]) {
			res[i] = a[i] << b[i]
		} else {
			res[i] = 0
		}
	}
	return res

Example:

	   +-------+-------+-------+-------+
	a: |  0x11 |  0x55 |  0x03 |  0xff |
	   +-------+-------+-------+-------+
	   +-------+-------+-------+-------+
	b: |   2   |   1   |   33  |   1   |
	   +-------+-------+-------+-------+
	res:
	   +-------+-------+-------+--------+
	   |  0x44 |  0xaa |   0   |  0xfe  |
	   +-------+-------+-------+--------+
*/
shl :: intrinsics.simd_shl

/*
Shift right lanes of a vector.

This procedure returns a vector, such that each lane holds the result of a
shift-right (aka shift-down) operation, of lane from the vector `a` by the shift
amount from the corresponding lane of the vector `b`.

If the shift amount is greater than the bit-width of a lane, the result is `0`
in the corresponding positions of the result.

If the first vector is a vector of signed integers, the arithmetic shift
operation is performed. Otherwise, if the first vector is a vector of unsigned
integers, a logical shift is performed.

Inputs:
- `a`: An integer vector of values to shift.
- `b`: An unsigned integer vector of the shift amounts.

Result:
- Shifted vector.

**Operation**:

	for i in 0 ..< len(res) {
		if b[i] < 8*size_of(a[i]) {
			res[i] = a[i] >> b[i]
		} else {
			res[i] = 0
		}
	}
	return res

Example:

This example assumes that the `a` vector is of a signed type.

	   +-------+-------+-------+-------+
	a: |  0x11 |  0x55 |  0x03 |  0xff |
	   +-------+-------+-------+-------+
	   +-------+-------+-------+-------+
	b: |   2   |   1   |   33  |   1   |
	   +-------+-------+-------+-------+
	res:
	   +-------+-------+-------+--------+
	   |  0x04 |  0x2a |   0   |  0xff  |
	   +-------+-------+-------+--------+
*/
shr :: intrinsics.simd_shr

/*
Shift left lanes of a vector (masked).

This procedure returns a vector, such that each lane holds the result of a
shift-left (aka shift-up) operation, of lane from the vector `a` by the shift
amount from the corresponding lane of the vector `b`.

The shift amount is rounded to (masked) to the bit-width of the lane.

Inputs:
- `a`: An integer vector of values to shift.
- `b`: An unsigned integer vector of the shift amounts.

Result:
- Shifted vector.

**Operation**:

	for i in 0 ..< len(res) {
		mask := 8*size_of(a[i]) - 1
		res[i] = a[i] << (b[i] & mask)
	}
	return res

Example:

	   +-------+-------+-------+-------+
	a: |  0x11 |  0x55 |  0x03 |  0xff |
	   +-------+-------+-------+-------+
	   +-------+-------+-------+-------+
	b: |   2   |   1   |   33  |   1   |
	   +-------+-------+-------+-------+
	res:
	   +-------+-------+-------+--------+
	   |  0x44 |  0xaa |  0x06 |  0xfe  |
	   +-------+-------+-------+--------+
*/
shl_masked :: intrinsics.simd_shl_masked

/*
Shift right lanes of a vector (masked).

This procedure returns a vector, such that each lane holds the result of a
shift-right (aka shift-down) operation, of lane from the vector `a` by the shift
amount from the corresponding lane of the vector `b`.

The shift amount is rounded to (masked) to the bit-width of the lane.

If the first vector is a vector of signed integers, the arithmetic shift
operation is performed. Otherwise, if the first vector is a vector of unsigned
integers, a logical shift is performed.

Inputs:
- `a`: An integer vector of values to shift.
- `b`: An unsigned integer vector of the shift amounts.

Result:
- Shifted vector.

**Operation**:

	for i in 0 ..< len(res) {
		mask := 8*size_of(a[i]) - 1
		res[i] = a[i] >> (b[i] & mask)
	}
	return res

Example:

This example assumes that the `a` vector is of a signed type.

	   +-------+-------+-------+-------+
	a: |  0x11 |  0x55 |  0x03 |  0xff |
	   +-------+-------+-------+-------+
	   +-------+-------+-------+-------+
	b: |   2   |   1   |   33  |   1   |
	   +-------+-------+-------+-------+
	res:
	   +-------+-------+-------+--------+
	   |  0x04 |  0x2a |  0x01 |  0xff  |
	   +-------+-------+-------+--------+
*/
shr_masked :: intrinsics.simd_shr_masked

/*
Saturated addition of SIMD vectors.

The *saturated sum* is a sum, that upon overflow or underflow, instead of
round-tripping, keeps the value clamped between the minimum and the maximum
values of the lane type.

This procedure returns a vector where each lane is the saturated sum of the
corresponding lanes of vectors `a` and `b`.

Inputs:
- `a`: An integer vector.
- `b`: An integer vector.

Returns:
- Saturated sum of the two vectors.

**Operation**:

	for i in 0 ..< len(res) {
		switch {
		case b[i] >= max(type_of(a[i])) - a[i]: // (overflow of a[i])
			res[i] = max(type_of(a[i]))
		case b[i] <= min(type_of(a[i])) - a[i]: // (underflow of a[i])
			res[i] = min(type_of(a[i]))
		} else {
			res[i] = a[i] + b[i]
		}
	}
	return res

Example:

Assuming unsigned bytes as the type of the element in a lane:

	   +-----+-----+-----+-----+
	a: |  0  | 255 |  2  |  3  |
	   +-----+-----+-----+-----+
	   +-----+-----+-----+-----+
	b: |  1  |  3  |  2  | -1  |
	   +-----+-----+-----+-----+
	res:
	   +-----+-----+-----+-----+
	   |  1  | 255 |  4  |  2  |
	   +-----+-----+-----+-----+
*/
saturating_add :: intrinsics.simd_saturating_add

/*
Saturated subtraction of 2 lanes of vectors.

The *saturated difference* is a difference, that upon overflow or underflow,
instead of round-tripping, keeps the value clamped between the minimum and the
maximum values of the lane type.

This procedure returns a vector where each lane is the saturated difference of
the corresponding lanes of vectors `a` and `b`.

Inputs:
- `a`: An integer vector to subtract from.
- `b`: An integer vector.

Returns:
- Saturated difference of the two vectors.

**Operation**:

	for i in 0 ..< len(res) {
		switch {
		case b[i] >= max(type_of(a[i])) + a[i]: // (overflow of a[i])
			res[i] = max(type_of(a[i]))
		case b[i] <= min(type_of(a[i])) + a[i]: // (underflow of a[i])
			res[i] = min(type_of(a[i]))
		} else {
			res[i] = a[i] - b[i]
		}
	}
	return res

Example:

Assuming unsigned bytes as the type of the element in a lane:

	   +-----+-----+-----+-----+
	a: |  0  | 255 |  2  |  3  |
	   +-----+-----+-----+-----+
	   +-----+-----+-----+-----+
	b: |  3  |  3  |  2  | -1  |
	   +-----+-----+-----+-----+
	res:
	   +-----+-----+-----+-----+
	   |  0  | 252 |  0  |  4  |
	   +-----+-----+-----+-----+
*/
saturating_sub :: intrinsics.simd_saturating_sub

/*
Bitwise AND of the lanes of SIMD vectors.

This procedure returns a vector, such that each lane has the result of a bitwise
AND operation between the corresponding lanes of the vectors `a` and `b`.

Inputs:
- `a`: An integer or boolean vector.
- `b`: An integer or boolean vector.

Returns:
- Result of the bitwise AND operation between two vectors.

**Operation**:

	for i in 0 ..< len(res) {
		res[i] = a[i] & b[i]
	}
	return res

Example:

	   +------+------+------+------+
	a: | 0x11 | 0x33 | 0x55 | 0xaa |
	   +------+------+------+------+
	   +------+------+------+------+
	b: | 0xff | 0xf0 | 0x0f | 0x00 |
	   +------+------+------+------+
	res:
	   +------+------+------+------+
	   | 0x11 | 0x30 | 0x05 | 0x00 |
	   +------+------+------+------+
*/
bit_and     :: intrinsics.simd_bit_and

/*
Bitwise OR of the 2 lanes of vectors.

This procedure returns a vector, such that each lane has the result of a bitwise
OR operation between the corresponding lanes of the vectors `a` and `b`.

Inputs:
- `a`: An integer or boolean vector.
- `b`: An integer or boolean vector.

Returns:
- Result of the bitwise OR operation between two vectors.

**Operation**:

	for i in 0 ..< len(res) {
		res[i] = a[i] | b[i]
	}
	return res

Example:

	   +------+------+------+------+
	a: | 0x11 | 0x33 | 0x55 | 0xaa |
	   +------+------+------+------+
	   +------+------+------+------+
	b: | 0xff | 0xf0 | 0x0f | 0x00 |
	   +------+------+------+------+
	res:
	   +------+------+------+------+
	   | 0xff | 0xf3 | 0x5f | 0xaa |
	   +------+------+------+------+
*/
bit_or      :: intrinsics.simd_bit_or

/*
Bitwise XOR of the 2 lanes of vectors.

This procedure returns a vector, such that each lane has the result of a bitwise
XOR operation between the corresponding lanes of the vectors `a` and `b`.

Inputs:
- `a`: An integer or boolean vector.
- `b`: An integer or boolean vector.

Returns:
- Result of the bitwise XOR operation between two vectors.

**Operation**:

	for i in 0 ..< len(res) {
		res[i] = a[i] ~ b[i]
	}
	return res

Example:

	   +------+------+------+------+
	a: | 0x11 | 0x33 | 0x55 | 0xaa |
	   +------+------+------+------+
	   +------+------+------+------+
	b: | 0xff | 0xf0 | 0x0f | 0x00 |
	   +------+------+------+------+
	res:
	   +------+------+------+------+
	   | 0xee | 0xc3 | 0x5a | 0xaa |
	   +------+------+------+------+
*/
bit_xor     :: intrinsics.simd_bit_xor

/*
Bitwise AND NOT of the 2 lanes of vectors.

This procedure returns a vector, such that each lane has the result of a bitwise
AND NOT operation between the corresponding lanes of the vectors `a` and `b`.

Inputs:
- `a`: An integer or boolean vector.
- `b`: An integer or boolean vector.

Returns:
- Result of the bitwise AND NOT operation between two vectors.

**Operation**:

	for i in 0 ..< len(res) {
		res[i] = a[i] &~ b[i]
	}
	return res

Example:

	   +------+------+------+------+
	a: | 0x11 | 0x33 | 0x55 | 0xaa |
	   +------+------+------+------+
	   +------+------+------+------+
	b: | 0xff | 0xf0 | 0x0f | 0x00 |
	   +------+------+------+------+
	res:
	   +------+------+------+------+
	   | 0x00 | 0x03 | 0x50 | 0xaa |
	   +------+------+------+------+
*/
bit_and_not :: intrinsics.simd_bit_and_not

/*
Negation of a SIMD vector.

This procedure returns a vector where each lane is the negation of the
corresponding lane in the vector `a`.

Inputs:
- `a`: An integer or a float vector to negate

Returns:
- Negated vector.

**Operation**:

	for i in 0 ..< len(res) {
		res[i] = -a[i]
	}
	return res

Example:

	   +------+------+------+------+
	a: |   0  |   1  |   2  |   3  |
	   +------+------+------+------+
	res:
	   +------+------+------+------+
	   |   0  |  -1  |  -2  |  -3  |
	   +------+------+------+------+
*/
neg :: intrinsics.simd_neg

/*
Absolute value of a SIMD vector.

This procedure returns a vector where each lane has the absolute value of the
corresponding lane in the vector `a`.

Inputs:
- `a`: An integer or a float vector to negate

Returns:
- Absolute value of a vector.

**Operation**:

	for i in 0 ..< len(res) {
		switch {
			case a[i] < 0:  res[i] = -a[i]
			case a[i] > 0:  res[i] = a[i]
			case a[i] == 0: res[i] = 0
		}
	}
	return res

Example:

	   +------+------+------+------+
	a: |   0  |  -1  |   2  |  -3  |
	   +------+------+------+------+
	res:
	   +------+------+------+------+
	   |   0  |   1  |   2  |   3  |
	   +------+------+------+------+
*/
abs   :: intrinsics.simd_abs

/*
Minimum of each lane of SIMD vectors.

This procedure returns a vector, such that each lane has is the minimum value
between the corresponding lanes in vectors `a` and `b`.

Inputs:
- `a`: An integer or a float vector.
- `b`: An integer or a float vector.

Returns:
- Vector with minimum values of each lane.

**Operation**:

	for i in 0 ..< len(res) {
		if a[i] < b[i] {
			res[i] = a[i]
		} else {
			res[i] = b[i]
		}
	}
	return res

Example:

	   +-----+-----+-----+-----+
	a: |  0  |  1  |  2  |  3  |
	   +-----+-----+-----+-----+
	   +-----+-----+-----+-----+
	b: |  0  |  2  |  1  | -1  |
	   +-----+-----+-----+-----+
	res:
	   +-----+-----+-----+-----+
	   |  0  |  1  |  1  | -1  |
	   +-----+-----+-----+-----+
*/
min   :: intrinsics.simd_min

/*
Maximum of each lane of SIMD vectors.

This procedure returns a vector, such that each lane has is the maximum value
between the corresponding lanes in vectors `a` and `b`.

Inputs:
- `a`: An integer or a float vector.
- `b`: An integer or a float vector.

Returns:
- Vector with maximum values of each lane.

**Operation**:

	for i in 0 ..< len(res) {
		if a[i] > b[i] {
			res[i] = a[i]
		} else {
			res[i] = b[i]
		}
	}
	return res

Example:

	   +-----+-----+-----+-----+
	a: |  0  |  1  |  2  |  3  |
	   +-----+-----+-----+-----+
	   +-----+-----+-----+-----+
	b: |  0  |  2  |  1  | -1  |
	   +-----+-----+-----+-----+
	res:
	   +-----+-----+-----+-----+
	   |  0  |  2  |  2  |  3  |
	   +-----+-----+-----+-----+
*/
max   :: intrinsics.simd_max

/*
Clamp lanes of SIMD vector.

This procedure returns a vector, where each lane is the result of the
clamping of the lane from the vector `v` between the values in the corresponding
lanes of vectors `min` and `max`.

Inputs:
- `v`: An integer or a float vector with values to be clamped.
- `min`: An integer or a float vector with minimum bounds.
- `max`: An integer or a float vectoe with maximum bounds.

**Operation**:

	for i in len(res) {
		val := v[i]
		switch {
			case val < min: val = min
			case val > max: val = max
		}
		res[i] = val
	}
	return res

Example:

	     +-------+-------+-------+-------+
	v:   |  -1   |  0.3  |  1.2  |   1   |
	     +-------+-------+-------+-------+
	     +-------+-------+-------+-------+
	min: |   0   |   0   |   0   |   0   |
	     +-------+-------+-------+-------+
		 +-------+-------+-------+-------+
	max: |   1   |   1   |   1   |   1   |
	     +-------+-------+-------+-------+
	res:
	     +-------+-------+-------+-------+
	     |   0   |  0.3  |   1   |   1   |
	     +-------+-------+-------+-------+
*/
clamp :: intrinsics.simd_clamp

/*
Check if lanes of SIMD vectors are equal.

This procedure checks each pair of lanes from vectors `a` and `b` for whether
they are equal, and if they are, the corresponding lane of the result vector
will have a value with all bits set (`0xff..ff`). Otherwise the lane of the
result vector will have the value `0`.

Inputs:
- `a`: An integer, a float or a boolean vector.
- `b`: An integer, a float or a boolean vector.

Returns:
- A vector of unsigned integers of the same size as the input vector's lanes,
containing comparison results for each lane.

**Operation**:

	for i in 0 ..< len(res) {
		if a[i] == b[i] {
			res[i] = unsigned(-1)
		} else {
			res[i] = 0
		}
	}
	return res

Example:

	   +-------+-------+-------+-------+
	a: |   0   |   1   |   2   |   3   |
	   +-------+-------+-------+-------+
	   +-------+-------+-------+-------+
	b: |   0   |   2   |   2   |   2   |
	   +-------+-------+-------+-------+
	res:
	   +-------+-------+-------+-------+
	   | 0xff  | 0x00  | 0xff  | 0x00  |
	   +-------+-------+-------+-------+
*/
lanes_eq :: intrinsics.simd_lanes_eq

/*
Check if lanes of SIMD vectors are not equal.

This procedure checks each pair of lanes from vectors `a` and `b` for whether
they are not equal, and if they are, the corresponding lane of the result
vector will have a value with all bits set (`0xff..ff`). Otherwise the lane of
the result vector will have the value `0`.

Inputs:
- `a`: An integer, a float or a boolean vector.
- `b`: An integer, a float or a boolean vector.

Returns:
- A vector of unsigned integers of the same size as the input vector's lanes,
containing comparison results for each lane.

**Operation**:

	for i in 0 ..< len(res) {
		if a[i] != b[i] {
			res[i] = unsigned(-1)
		} else {
			res[i] = 0
		}
	}
	return res

Example:

	   +-------+-------+-------+-------+
	a: |   0   |   1   |   2   |   3   |
	   +-------+-------+-------+-------+
	   +-------+-------+-------+-------+
	b: |   0   |   2   |   2   |   2   |
	   +-------+-------+-------+-------+
	res:
	   +-------+-------+-------+-------+
	   | 0x00  | 0xff  | 0x00  | 0xff  |
	   +-------+-------+-------+-------+
*/
lanes_ne :: intrinsics.simd_lanes_ne

/*
Check if values of SIMD vector are less than the values of another SIMD vector.

This procedure checks each pair of lanes from vectors `a` and `b` for whether
the lane of `a` is less than the lane of `b`, and if so, the corresponding lane
of the result vector will have a value with all bits set (`0xff..ff`). Otherwise
the lane of the result vector will have the value `0`.

Inputs:
- `a`: An integer or a float vector.
- `b`: An integer or a float vector.

Returns:
- A vector of unsigned integers of the same size as the input vector's lanes,
containing comparison results for each lane.

**Operation**:

	for i in 0 ..< len(res) {
		if a[i] < b[i] {
			res[i] = unsigned(-1)
		} else {
			res[i] = 0
		}
	}
	return res

Example:

	   +-------+-------+-------+-------+
	a: |   0   |   1   |   2   |   3   |
	   +-------+-------+-------+-------+
	   +-------+-------+-------+-------+
	b: |   0   |   2   |   2   |   2   |
	   +-------+-------+-------+-------+
	res:
	   +-------+-------+-------+-------+
	r: | 0x00  | 0x00  | 0x00  | 0xff  |
	   +-------+-------+-------+-------+
*/
lanes_lt :: intrinsics.simd_lanes_lt

/*
Check if values of SIMD vector are less than or equal the values of another
SIMD vector.

This procedure checks each pair of lanes from vectors `a` and `b` for whether
lane of `a` is less than or equal to the lane of `b`, and if so, the
corresponding lane of the result vector will have a value with all bits set
(`0xff..ff`). Otherwise the lane of the result vector will have the value `0`.

Inputs:
- `a`: An integer or a float vector.
- `b`: An integer or a float vector.

Returns:
- A vector of unsigned integers of the same size as the input vector's lanes,
containing comparison results for each lane.

**Operation**:

	for i in 0 ..< len(res) {
		if a[i] <= b[i] {
			res[i] = unsigned(-1)
		} else {
			res[i] = 0
		}
	}
	return res

Example:

	   +-------+-------+-------+-------+
	a: |   0   |   1   |   2   |   3   |
	   +-------+-------+-------+-------+
	   +-------+-------+-------+-------+
	b: |   0   |   2   |   2   |   2   |
	   +-------+-------+-------+-------+
	res:
	   +-------+-------+-------+-------+
	   | 0xff  | 0x00  | 0xff  | 0xff  |
	   +-------+-------+-------+-------+
*/
lanes_le :: intrinsics.simd_lanes_le

/*
Check if values of SIMD vector are greater than the values of another SIMD
vector.

This procedure checks each pair of lanes from vectors `a` and `b` for whether
lane of `a` is greater than to the lane of `b`, and if so, the corresponding
lane of the result vector will have a value with all bits set (`0xff..ff`).
Otherwise the lane of the result vector will have the value `0`.

Inputs:
- `a`: An integer or a float vector.
- `b`: An integer or a float vector.

Returns:
- A vector of unsigned integers of the same size as the input vector's lanes,
containing comparison results for each lane.

**Operation**:

	for i in 0 ..< len(res) {
		if a[i] > b[i] {
			res[i] = unsigned(-1)
		} else {
			res[i] = 0
		}
	}
	return res

Example:

	   +-------+-------+-------+-------+
	a: |   0   |   1   |   2   |   3   |
	   +-------+-------+-------+-------+
	   +-------+-------+-------+-------+
	b: |   0   |   2   |   2   |   2   |
	   +-------+-------+-------+-------+
	res:
	   +-------+-------+-------+-------+
	   | 0x00  | 0x00  | 0x00  | 0xff  |
	   +-------+-------+-------+-------+
*/
lanes_gt :: intrinsics.simd_lanes_gt

/*
Check if values of SIMD vector are greater than or equal the values of another
SIMD vector.

This procedure checks each pair of lanes from vectors `a` and `b` for whether
lane of `a` is greater than or equal to the lane of `b`, and if so, the
corresponding lane of the result vector will have a value with all bits set
(`0xff..ff`). Otherwise the lane of the result vector will have the value `0`.

Inputs:
- `a`: An integer or a float vector.
- `b`: An integer or a float vector.

Returns:
- A vector of unsigned integers of the same size as the input vector's lanes,
containing comparison results for each lane.

**Operation**:

	for i in 0 ..< len(res) {
		if a[i] >= b[i] {
			res[i] = unsigned(-1)
		} else {
			res[i] = 0
		}
	}
	return res

Example:

	   +-------+-------+-------+-------+
	a: |   0   |   1   |   2   |   3   |
	   +-------+-------+-------+-------+
	   +-------+-------+-------+-------+
	b: |   0   |   2   |   2   |   2   |
	   +-------+-------+-------+-------+
	res:
	   +-------+-------+-------+-------+
	   | 0xff  | 0x00  | 0xff  | 0xff  |
	   +-------+-------+-------+-------+
*/
lanes_ge :: intrinsics.simd_lanes_ge

/*
Perform a gather load into a SIMD vector.

A *gather* operation is memory load operation, that loads values from an vector
of addresses into a single value vector. This can be used to achieve the
following results:

- Accessing every N'th element of an array (strided access)
- Access of elements according to some computed offsets (indexed access).
- Access of elements in a different order (shuffling access).

When used alongside other SIMD procedures in order to compute the offsets
for the `ptr` and `mask` parameters.

Inputs:
- `ptr`: A vector of memory locations. Each pointer points to a single value,
	of a SIMD vector's lane type, that will be loaded into the vector. Pointer
	in this vector can be `nil` or any other invalid value, if the corresponding
	value in the `mask` parameter is zero.
- `val`: A vector of values that will be used at corresponding positions
	of the result vector, if the corresponding memory location has been
	masked out.
- `mask`: A vector of booleans or unsigned integers that determines which memory
	locations to read from. If the value at an index has the value true
	(lowest bit set), the value at that index will be loaded into the result
	vector from the corresponding memory location in the `ptr` vector. Otherwise
	the value will be loaded from the `val` vector.

Returns:

This procedure returns a vector with all values from unmasked indices
loaded from the pointer vector `ptr`, and all values from masked indices loaded
from the value vector `val`.

**Operation**:

	for i in 0 ..< len(res) {
		if mask[i]&1 == 1 {
			res[i] = ptr[i]^
		} else {
			res[i] = val[i]
		}
	}
	return res

Example:

Example below loads 2 lanes of values from 2 lanes of float vectors, `v1` and
`v2`. From each of these vectors we're loading the second value, into the first
and the third position of the result vector.

Therefore the `ptrs` argument is initialized such that the first and the third
value are the addresses of the values that we want to load into the result
vector, and we'll fill in `nil` for the rest of them. To prevent CPU from
dereferencing those `nil` addresses we provide the mask that only allows us
to load valid positions of the `ptrs` array, and the array of defaults which
will have `127` in each position as the default value.

	v1 := [4] f32 {1, 2, 3, 4};
	v2 := [4] f32 {9, 10,11,12};
	ptrs := #simd [4]rawptr { &v1[1], nil, &v2[1], nil }
	mask := #simd [4]bool { true, false, true, false }
	defaults := #simd [4]f32 { 0x7f, 0x7f, 0x7f, 0x7f }
	res := simd.gather(ptrs, defaults, mask)
	fmt.println(res)

The code would print `<2, 127, 10, 127>`. First and the third positions came
from the `ptrs` array, and the other 2 lanes of from the default vector.
Graphic below shows how the values of the result are decided based on the mask:

	      +-------------------------------+ 
	mask: |   1   |   0   |   1   |   0   | 
	      +-------------------------------+ 
	        |         |       |       `----------------------------.
	        |         |       |                                    |
	        |          `----  |  ------------------------.         |
	        v                 v                          v         v
	      +-------------------------------+       +-------------------+
	ptrs: |  &m0  |  nil  |  &m2  |  nil  | vals: | d0 | d1 | d2 | d3 |
	      +-------------------------------+       +-------------------+
	          |               |                          |         |
	          |          .--- | -------------------------'         |
	          |         |     |          ,-------------------------'
	          v         v     v         v
	        +-------------------------------+
	result: |   m0  |   d1  |   m2  |  d3   |
	        +-------------------------------+
*/
gather  :: intrinsics.simd_gather

/*
Perform a scatter store from a SIMD vector.

A *scatter* operation is a memory store operation that stores values from a
vector into multiple memory locations. This operation is effectively the
opposite from the *gather* operation.

Inputs:
- `ptr`: A vector of memory locations. Each masked location will be written
	to with a value from the `val` vector. Pointer in this vector can be `nil`
	or any other invalid value, if the corresponding value in the `mask`
	parameter is zero.
- `val`: A vector of values to write to the memory locations.
- `mask`: A vector of booleans or unsigned integers, that decides which lanes
	get written to memory. If the value of the mask is `true` (the lowest bit
	set), the corresponding lane is written into memory. Otherwise it's not
	written into memory.

**Operation**:

	for i in 0 ..< len(ptr) {
		if mask[i]&1 == 1 {
			ptr[i]^ = val[i]
		}
	}

Example:

Example below writes value `127` to the second element of two different
vectors. The addresses of store destinations are written to the first and the
third argument of the `ptr` vector, and the `mask` is set accordingly.

	v1 := [4] f32 {1, 2, 3, 4};
	v2 := [4] f32 {5, 6, 7, 8};
	ptrs := #simd [4]rawptr { &v1[1], nil, &v2[1], nil }
	mask := #simd [4]bool { true, false, true, false }
	vals := #simd [4]f32 { 0x7f, 0x7f, 0x7f, 0x7f }
	simd.scatter(ptrs, vals, mask)
	fmt.println(v1)
	fmt.println(v2)

This code prints the values of the two vectors, after modification by `scatter`:

	[1, 127, 3, 4]
	[5, 127, 7, 8]

Graphic below shows how the data gets written into memory.

	
	      +-------------------+
	mask: | 1  | 0  | 1  | 0  |
	      +-------------------+
	        |    |    |    |
	        v    X    v    X
	      +-------------------+
	vals: | d0 | d1 | d2 | d3 |
	      +-------------------+
	         |         \
	         v          v
	      +-----------------------+
	ptrs: | &m0 | nil | &m2 | nil |
	      +-----------------------+
*/
scatter :: intrinsics.simd_scatter

/*
Perform a masked load into the SIMD vector.

This procedure performs a masked load from memory, into the vector. The `ptr`
argument specifies the base address from which the values of SIMD vector
will be loaded. The mask selects the source for the result vector's lanes. If
the mask for the corresponding lane has the value `true` (lowest bit set), the
result lane is loaded from memory. Otherwise the result lane is loaded from the
corresponding lane of the `val` vector.

Inputs:
- `ptr`: The address of the vector values to load. Masked-off values are not
	accessed.
- `val`: The vector of values that will be loaded into the masked slots of the
	result vector.
- `mask`: The mask that selects where to load the values from.

Returns:
- The loaded vector. The lanes for which the mask was set, are loaded from
memory, and the other lanes are loaded from the `val` vector.

**Operation**:

	for i in 0 ..< len(res) {
		if mask[i]&1 == 1 {
			res[i] = ptr[i]
		} else {
			res[i] = vals[i]
		}
	}
	return res

Example:

The following code loads two values from the `src` vector, the first and the
third value (selected by the mask). The masked-off values are given the value
of 127 (`0x7f`).

	src := [4] f32 {1, 2, 3, 4};
	mask := #simd [4]bool { true, false, true, false }
	vals := #simd [4]f32 { 0x7f, 0x7f, 0x7f, 0x7f }
	res := simd.masked_load(&src, vals, mask)
	fmt.println(res)

The above code prints the following:

	<1, 127, 3, 127>

Graphic below demonstrates the flow of lanes.

	      +-------------------------------+ 
	mask: |   1   |   0   |   1   |   0   | 
	      +-------------------------------+ 
	        |         |       |       `----------------------------.
	        |         |       |                                    |
	        |          `----  |  ------------------------.         |
	ptr     v                 v                          v         v
	+---->+-------------------------------+       +-------------------+
	      |   v1  |   v2  |   v3  |   v4  | vals: | d0 | d1 | d2 | d3 |
	      +-------------------------------+       +-------------------+
	          |               |                          |         |
	          |          .--- | -------------------------'         |
	          |         |     |          ,-------------------------'
	          v         v     v         v
	        +-------------------------------+
	result: |  v1   |   d1  |  v3   |  d3   |
	        +-------------------------------+
*/
masked_load  :: intrinsics.simd_masked_load

/*
Perform a masked store to memory.

This procedure performs a masked store, from a vector `val`, into memory at
address `ptr`, with the `mask` deciding which lanes are going to be stored,
and which aren't. If the mask at a corresponding lane has the value `true`
(lowest bit set), the lane is stored into memory. Otherwise the lane is not
stored into memory.

Inputs:
- `ptr`: The base address of the store.
- `val`: The vector to store.
- `mask`: The mask, selecting which lanes of the vector to store into memory.

**Operation**:

	for i in len(val) {
		if mask[i]&1 == 1 {
			ptr[i] = val
		}
	}

Example:

Example below stores the value 127 into the first and the third slot of the
vector `v`.

	v := [4] f32 {1, 2, 3, 4};
	mask := #simd [4]bool { true, false, true, false }
	vals := #simd [4]f32 { 0x7f, 0x7f, 0x7f, 0x7f }
	simd.masked_store(&v, vals, mask)
	fmt.println(v)

After the masked store the printed result is:

	[127, 2, 127, 4]

Graphic below shows the flow of lanes:

	      +-------------------+
	mask: | 1  | 0  | 1  | 0  |
	      +-------------------+
	        |    |    |    |
			v    X    v    X
	      +-------------------+
	vals: | v0 | v1 | v2 | v3 |
	      +-------------------+
	         |         \
	ptr      v          v
	 +--->+-----------------------+
	      | v0  | ... | v2  | ... |
	      +-----------------------+
*/
masked_store :: intrinsics.simd_masked_store

/*
Load consecutive scalar values and expand into a SIMD vector.

This procedure loads a number of consecutive scalar values from an address,
specified by the `ptr` parameter, and stores them in a result vector, according
to the mask. The number of values read from memory is the number of set bits
in the mask. The lanes, for which the mask has the value `true` get the next
consecutive value from the memory, otherwise if the mask is `false` for the
lane, its value is filled from the corresponding lane of the `val` parameter.

This procedure acts like `masked_store`, except the values from the memory are
read consecutively, and not according to the lanes. The memory values are read
and assigned to the result vector's masked lanes in order of increasing
addresses.

Inputs:
- `ptr`: The pointer to the memory to read from.
- `vals`: The default values for masked-off entries.
- `mask`: The mask, that determines which lanes get consecutive memory values.

Returns:
- The result vector, holding masked memory values unmasked default values.

**Operation**:

	mem_idx := 0
	for i in 0 ..< len(mask) {
		if mask[i]&1 == 1 {
			res[i] = ptr[mem_idx]
			mem_idx += 1
		} else {
			res[i] = val[i]
		}
	}
	return res

Example:

The example below loads two values from memory of the vector `v`. Two values in
the mask are set to `true`, meaning only two memory items will be loaded into
the result vector. The mask is set to `true` in the first and the third
position, which specifies that the first memory item will be read into the
first lane of the result vector, and the second memory item will be read into
the third lane of the result vector. All the other lanes of the result vector
will be initialized to the default value `127`.

	v := [2] f64 {1, 2};
	mask := #simd [4]bool { true, false, true, false }
	vals := #simd [4]f64 { 0x7f, 0x7f, 0x7f, 0x7f }
	res := simd.masked_expand_load(&v, vals, mask)
	fmt.println(res)

The above code prints the following:

	<1, 127, 2, 127>

Graphical representation of the operation:


	ptr --->+-----------+-----
	        | m0  | m1  | ...
	        +-----------+-----
	          |      `--.
	          v         v
	        +-------------------+         +-------------------+
	mask:   | 1  | 0  | 1  | 0  |   vals: | v0 | v1 | v2 | v3 |
	        +-------------------+         +-------------------+
	          |         |                         |         |
	          |     .-- | -----------------------'          |
	          |    |    |     ,----------------------------'
	          v    v    v    v
	        +-------------------+
	result: | m0 | v1 | m1 | v3 |
	        +-------------------+
*/
masked_expand_load    :: intrinsics.simd_masked_expand_load

/*
Store masked values to consecutive memory locations.

This procedure stores values from masked lanes of a vector `val` consecutively
into memory. This operation is the opposite of `masked_expand_load`. The number
of items stored into memory is the number of set bits in the mask. If the value
in a lane of a mask is `true`, that lane is stored into memory. Otherwise
nothing is stored.

Inputs:
- `ptr`: The pointer to the memory of a store.
- `val`: The vector to store into memory.
- `mask`: The mask that selects which values to store into memory.

**Operation**:

	mem_idx := 0
	for i in 0 ..< len(mask) {
		if mask[i]&1 == 1 {
			ptr[mem_idx] = val[i]
			mem_idx += 1
		}
	}

Example:

The code below fills the vector `v` with two values from a 4-element SIMD
vector, the first and the third value. The items in the mask are set to `true`
in those lanes.

	v := [2] f64 { };
	mask := #simd [4]bool { true, false, true, false }
	vals := #simd [4]f64 { 1, 2, 3, 4 }
	simd.masked_compress_store(&v, vals, mask)
	fmt.println(v)

The code above prints the following:

	[1, 3]

Graphical representation of the operation:

	      +-------------------+
	mask: | 1  | 0  | 1  | 0  |
	      +-------------------+
	        |         |
	        v         v
	      +-------------------+
	vals: | v0 | v1 | v2 | v3 |
	      +-------------------+
	        |      ,--'
	ptr     v     v
	 +--->+-----------------
	      | v0  | v2  | ...
	      +-----------------
*/
masked_compress_store :: intrinsics.simd_masked_compress_store

/*
Extract scalar from a SIMD vector's lane.

This procedure returns the scalar, from the lane at the specified index of the
vector.

Inputs:
- `a`: The vector to extract from.
- `idx`: Lane index.

Returns:
- The value of the lane at the specified index.

**Operation**:

	return a[idx]
*/
extract :: intrinsics.simd_extract

/*
Replace the value in a SIMD vector's lane.

This procedure places a scalar value at the lane at the specified index of
the vector.

Inputs:
- `a`: The vector to replace a lane.
- `idx`: Lane index.
- `elem`: Scalar to place.

Returns:
- Vector with the specified lane replaced.

**Operation**:

	a[idx] = elem
*/
replace :: intrinsics.simd_replace

/*
Reduce SIMD vector to a scalar by adding all the lanes.

This procedure returns a scalar, that is the ordered sum of all SIMD lanes. The
ordered sum may be important for accounting for precision errors in
floating-point computation, as floating-point addition is not associative,
that is `(a+b)+c` may not equal to `a+(b+c)`.

Inputs:
- `a`: Vector to reduce

Result:
- Sum of all lanes, as a scalar.

**Operation**:

	res := 0
	for i in 0 ..< len(a) {
		res += a[i]
	}
*/
reduce_add_ordered :: intrinsics.simd_reduce_add_ordered

/*
Reduce SIMD vector to a scalar by multiplying all the lanes.

This procedure returns a scalar, that is the ordered product of all SIMD lanes.
The ordered product may be important for accounting for precision errors in
floating-point computation, as floating-point multiplication is not associative,
that is `(a*b)*c` may not equal to `a*(b*c)`.

Inputs:
- `a`: Vector to reduce

Result:
- Product of all lanes, as a scalar.

**Operation**:

	res := 1
	for i in 0 ..< len(a) {
		res *= a[i]
	}
*/
reduce_mul_ordered :: intrinsics.simd_reduce_mul_ordered

/*
Reduce SIMD vector to a scalar by finding the minimum value between all of the lanes.

This procedure returns a scalar, that is the minimum value of all the lanes
in a vector.

Inputs:
- `a`: Vector to reduce

Result:
- Minimum value of all lanes, as a scalar.

**Operation**:

	res := 0
	for i in 0 ..< len(a) {
		res = min(res, a[i])
	}
*/
reduce_min :: intrinsics.simd_reduce_min

/*
Reduce SIMD vector to a scalar by finding the maximum value between all of the lanes.

This procedure returns a scalar, that is the maximum value of all the lanes
in a vector.

Inputs:
- `a`: Vector to reduce

Result:
- Minimum value of all lanes, as a scalar.

**Operation**:

	res := 0
	for i in 0 ..< len(a) {
		res = max(res, a[i])
	}
*/
reduce_max :: intrinsics.simd_reduce_max

/*
Reduce SIMD vector to a scalar by performing bitwise AND of all of the lanes.

This procedure returns a scalar, that is the result of the bitwise AND operation
between all of the lanes in a vector.

Inputs:
- `a`: Vector to reduce

Result:
- Bitwise AND of all lanes, as a scalar.

**Operation**:

	res := 0
	for i in 0 ..< len(a) {
		res &= a[i]
	}
*/
reduce_and :: intrinsics.simd_reduce_and

/*
Reduce SIMD vector to a scalar by performing bitwise OR of all of the lanes.

This procedure returns a scalar, that is the result of the bitwise OR operation
between all of the lanes in a vector.

Inputs:
- `a`: Vector to reduce

Result:
- Bitwise AND of all lanes, as a scalar.

**Operation**:

	res := 0
	for i in 0 ..< len(a) {
		res |= a[i]
	}
*/
reduce_or :: intrinsics.simd_reduce_or

/*
Reduce SIMD vector to a scalar by performing bitwise XOR of all of the lanes.

This procedure returns a scalar, that is the result of the bitwise XOR operation
between all of the lanes in a vector.

Inputs:
- `a`: Vector to reduce

Result:
- Bitwise XOR of all lanes, as a scalar.

**Operation**:

	res := 0
	for i in 0 ..< len(a) {
		res ~= a[i]
	}
*/
reduce_xor :: intrinsics.simd_reduce_xor

/*
Reduce SIMD vector to a scalar by performing bitwise OR of all of the lanes.

This procedure returns a scalar, that is the result of the bitwise OR operation
between all of the lanes in a vector.

Inputs:
- `a`: Vector to reduce

Result:
- Bitwise OR of all lanes, as a scalar.

**Operation**:

	res := 0
	for i in 0 ..< len(a) {
		res |= a[i]
	}
*/
reduce_any :: intrinsics.simd_reduce_any

/*
Reduce SIMD vector to a scalar by performing bitwise AND of all of the lanes.

This procedure returns a scalar, that is the result of the bitwise AND operation
between all of the lanes in a vector.

Inputs:
- `a`: Vector to reduce

Result:
- Bitwise AND of all lanes, as a scalar.

**Operation**:

	res := 0
	for i in 0 ..< len(a) {
		res &= a[i]
	}
*/
reduce_all :: intrinsics.simd_reduce_all

/*
Reorder the lanes of a SIMD vector.

This procedure reorders the lanes of a vector, according to the provided
indices. The number of indices correspond to the number of lanes in the
result vector and must be the same as the number of lanes of the input vector.
Each index specifies, the lane of the scalar from the input vector, which
will be written at the corresponding position of the result vector.

Inputs:
- `x`: The input vector.
- `indices`: The indices of lanes to write to the result vector.

Result:
- Swizzled input vector.

**Operation**:

	res = {}
	for i in 0 ..< len(indices) {
		res[i] = x[indices[i]]
	}
	return res

**Example**

The example below shows how the indices are used to determine which lanes of the
input vector get written into the result vector.
	
	x := #simd [4]f32 { 1.5, 2.5, 3.5, 4.5 }
	res := simd.swizzle(x, 0, 3, 1, 1)
	fmt.println("res")

The above code will print the following to the console:

	[ 1.5, 3.5, 2.5, 2.5 ]

The graphical representation of the operation is as follows. The `idx` vector in
the picture represents the `indices` parameter:

	      0     1     2     3
	      +-----+-----+-----+-----+
	x:    | 1.5 | 2.5 | 3.5 | 4.5 |
	      +-----+-----+-----+-----+
	         ^     ^           ^
	         |     |           |
	         |      '----.     |
	         |     .---- | ---'
	         |     |     |
	         |     |     +------.
	      +-----+-----+-----+-----+
	idx:  |  0  |  3  |  1  |  1  |
	      +-----+-----+-----+-----+
	         ^     ^     ^     ^
	         |     |     |     |
	      +-----+-----+-----+-----+
	res:  | 1.5 | 3.5 | 2.5 | 2.5 |
	      +-----+-----+-----+-----+
*/
swizzle :: builtin.swizzle


/*
Reorder the lanes of two SIMD vectors.

This procedure returns a vector, containing the scalars from the lanes of two
vectors, according to the provided indices vector. Each index in the indices
vector specifies, the lane of the scalar from one of the two input vectors,
which will be written at the corresponding position of the result vector. If
the index is within bounds 0 ..< len(A), it corresponds to the indices of the
first input vector. Otherwise the index corresponds to the indices of the second
input vector.

Inputs:
- `a`: The first input vector.
- `b`: The second input vector.
- `indices`: The indices.

Result:
- Input vectors, shuffled according to the indices.

**Operation**:

	res = {}
	for i in 0 ..< len(indices) {
		idx = indices[i];
		if idx < len(a) {
			res[i] = a[idx]
		} else {
			res[i] = b[idx]
		}
	}
	return res

**Example**

The example below shows how the indices are used to determine lanes of the
input vector that are shuffled into the result vector.
	
	a := #simd [4]f32 { 1, 2, 3, 4 }
	b := #simd [4]f32 { 5, 6, 7, 8 }
	indices := #simd[4]
	res := simd.swizzle(x, 0, 4, 2, 5)
	fmt.println("res")

The above code will print the following to the console:

	[ 1, 5, 3, 6 ]

The graphical representation of the operation is as follows. The `idx` vector in
the picture represents the `indices` parameter:

	      0     1     2     3            4     5     6     7
	      +-----+-----+-----+-----+      +-----+-----+-----+-----+
	a:    |  1  |  2  |  3  |  4  |  b:  |  5  |  6  |  7  |  8  |
	      +-----+-----+-----+-----+      +-----+-----+-----+-----+
	         ^           ^                  ^     ^
	         |           |                  |     |
	         |           |                  |     |
	         |      .--- | ----------------'      |
	         |     |     |     .-----------------'
	      +-----+-----+-----+-----+
	idx:  |  0  |  4  |  2  |  5  |
	      +-----+-----+-----+-----+
	         ^     ^     ^     ^
	         |     |     |     |
	      +-----+-----+-----+-----+
	res:  |  1  |  5  |  3  |  6  |
	      +-----+-----+-----+-----+
*/
shuffle :: intrinsics.simd_shuffle

/*
Select values from one of the two vectors.

This procedure returns a vector, which has, on each lane a value from one of the
corresponding lanes in one of the two input vectors based on the `cond`
parameter. On each lane, if the value of the `cond` parameter is `true` (or
non-zero), the result lane will have a value from the `true` input vector,
otherwise the result lane will have a value from the `false` input vector.

Inputs:
- `cond`: The condition vector.
- `true`: The first input vector.
- `false`: The second input vector.

Result:
- The result of selecting values from the two input vectors.

**Operation**:

	res = {}
	for i in 0 ..< len(cond) {
		if cond[i] {
			res[i] = true[i]
		} else {
			res[i] = false[i]
		}
	}
	return res

**Example**:

The following example selects values from the two input vectors, `a` and `b`
into a single vector.

	a := #simd [4] f64 { 1,2,3,4 };
	b := #simd [4] f64 { 5,6,7,8 };
	cond := #simd[4] int { 1, 0, 1, 0 }
	fmt.println(simd.select(cond,a,b))

Output:
	
	[ 1, 6, 3, 8 ]

Graphically, the operation looks as follows. The `t` and `f` represent the
`true` and `false` vectors respectively:

	      0     1     2     3            0     1     2     3
	      +-----+-----+-----+-----+      +-----+-----+-----+-----+
	t:    |  1  |  2  |  3  |  4  |  f:  |  5  |  6  |  7  |  8  |
	      +-----+-----+-----+-----+      +-----+-----+-----+-----+
	         ^           ^                        ^           ^
	         |           |                        |           |
	         |           |                        |           |
	         |      .--- | ----------------------'            |
	         |     |     |     .-----------------------------'
	      +-----+-----+-----+-----+
	cond: |  1  |  0  |  1  |  0  |
	      +-----+-----+-----+-----+
	         ^     ^     ^     ^
	         |     |     |     |
	      +-----+-----+-----+-----+
	res:  |  1  |  5  |  3  |  6  |
	      +-----+-----+-----+-----+
*/
select :: intrinsics.simd_select

/*
Compute the square root of each lane in a SIMD vector.
*/
sqrt    :: intrinsics.sqrt

/*
Ceil each lane in a SIMD vector.
*/
ceil    :: intrinsics.simd_ceil

/*
Floor each lane in a SIMD vector.
*/
floor   :: intrinsics.simd_floor

/*
Truncate each lane in a SIMD vector.
*/
trunc   :: intrinsics.simd_trunc

/*
Compute the nearest integer of each lane in a SIMD vector.
*/
nearest :: intrinsics.simd_nearest

/*
Transmute a SIMD vector into an integer vector.
*/
to_bits :: intrinsics.simd_to_bits

/*
Reverse the lanes of a SIMD vector.

This procedure reverses the lanes of a SIMD vector, putting last lane in the
first spot, etc. This procedure is equivalent to the following call (for
4-element vectors):

	swizzle(a, 3, 2, 1, 0)
*/
lanes_reverse :: intrinsics.simd_lanes_reverse

/*
Rotate the lanes of a SIMD vector left.

This procedure rotates the lanes of a SIMD vector, putting the first lane of the
last spot, second lane in the first spot, third lane in the second spot, etc.
For 4-element vectors, this procedure is equvalent to the following:

	swizzle(a, 1, 2, 3, 0)
*/
lanes_rotate_left :: intrinsics.simd_lanes_rotate_left

/*
Rotate the lanes of a SIMD vector right.

This procedure rotates the lanes of a SIMD vector, putting the first lane of the
second spot, second lane in the third spot, etc. For 4-element vectors, this
procedure is equvalent to the following:

	swizzle(a, 3, 0, 1, 2)
*/
lanes_rotate_right :: intrinsics.simd_lanes_rotate_right

/*
Count the number of set bits in each lane of a SIMD vector.
*/
count_ones :: intrinsics.count_ones

/*
Count the number of unset bits in each lane of a SIMD vector.
*/
count_zeros :: intrinsics.count_zeros

/*
Count the number of trailing unset bits in each lane of a SIMD vector.
*/
count_trailing_zeros :: intrinsics.count_trailing_zeros

/*
Count the number of leading unset bits in each lane of a SIMD vector.
*/
count_leading_zeros :: intrinsics.count_leading_zeros

/*
Reverse the bit pattern of a SIMD vector.
*/
reverse_bits :: intrinsics.reverse_bits

/*
Perform a FMA (Fused multiply-add) operation on each lane of SIMD vectors.

A fused multiply-add is a ternary operation that for three operands, `a`, `b`
and `c`  performs the operation `a*b+c`. This operation is a hardware feature
that allows to minimize floating-point error and allow for faster computation.

This procedure performs a FMA operation on each lane of the SIMD vectors.

Inputs:
- `a`: The multiplier
- `b`: The multiplicand
- `c`: The addend

Returns:
- `a*b+c`

**Operation**

	res := 0
	for i in 0 ..< len(a) {
		res[i] = fma(a[i], b[i], c[i])
	}
	return res
*/
fused_mul_add :: intrinsics.fused_mul_add

/*
Perform a FMA (Fused multiply-add) operation on each lane of SIMD vectors.

A fused multiply-add is a ternary operation that for three operands, `a`, `b`
and `c`  performs the operation `a*b+c`. This operation is a hardware feature
that allows to minimize floating-point error and allow for faster computation.

This procedure performs a FMA operation on each lane of the SIMD vectors.

Inputs:
- `a`: The multiplier
- `b`: The multiplicand
- `c`: The addend

Returns:
- `a*b+c`

**Operation**

	res := 0
	for i in 0 ..< len(a) {
		res[i] = fma(a[i], b[i], c[i])
	}
	return res
*/
fma :: intrinsics.fused_mul_add

/*
Convert pointer to SIMD vector to an array pointer.
*/
to_array_ptr :: #force_inline proc "contextless" (v: ^#simd[$LANES]$E) -> ^[LANES]E {
	return (^[LANES]E)(v)
}

/*
Convert SIMD vector to an array.
*/
to_array :: #force_inline proc "contextless" (v: #simd[$LANES]$E) -> [LANES]E {
	return transmute([LANES]E)(v)
}

/*
Convert array to SIMD vector.
*/
from_array :: #force_inline proc "contextless" (v: $A/[$LANES]$E) -> #simd[LANES]E {
	return transmute(#simd[LANES]E)v
}

/*
Convert slice to SIMD vector.
*/
from_slice :: proc($T: typeid/#simd[$LANES]$E, slice: []E) -> T {
	assert(len(slice) >= LANES, "slice length must be a least the number of lanes")
	array: [LANES]E
	#no_bounds_check for i in 0..<LANES {
		array[i] = slice[i]
	}
	return transmute(T)array
}

/*
Perform binary not operation on a SIMD vector.

This procedure returns a vector where each lane is the result of the binary
NOT operation of the corresponding lane in the vector `a`.

**Operation**:

	for i in 0 ..< len(res) {
		res[i] = ~a[i]
	}
	return res

Example:

	   +------+------+------+------+
	a: | 0x00 | 0x50 | 0x80 | 0xff |
	   +------+------+------+------+
	res:
	   +------+------+------+------+
	   | 0xff | 0xaf | 0x7f | 0x00 |
	   +------+------+------+------+
*/
bit_not :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_integer(E) {
	return xor(v, T(~E(0)))
}

/*
Copy the signs from lanes of one SIMD vector into another SIMD vector.
*/
copysign :: #force_inline proc "contextless" (v, sign: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_float(E) {
	neg_zero := to_bits(T(-0.0))
	sign_bit := to_bits(sign) & neg_zero
	magnitude := to_bits(v) &~ neg_zero
	return transmute(T)(sign_bit|magnitude)
}

/*
Return signs of SIMD lanes.

This procedure returns a vector, each lane of which contains either +1.0 or
-1.0 depending on the sign of the value in the corresponding lane of the
input vector. If the lane of the input vector has NaN, then the result vector
will contain this NaN value as-is.
*/
signum :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_float(E) {
	is_nan := lanes_ne(v, v)
	return select(is_nan, v, copysign(T(1), v))
}

/*
Calculate reciprocals of SIMD lanes.

This procedure returns a vector where each lane is the reciprocal of the
corresponding lane in the vector `a`.

Inputs:
- `a`: An integer or a float vector to negate

Returns:
- Negated vector.

**Operation**:

	for i in 0 ..< len(res) {
		res[i] = 1.0 / a[i]
	}
	return res

Example:

	   +------+------+------+------+
	a: |   0  |   1  |   3  |   5  |
	   +------+------+------+------+
	res:
	   +------+------+------+------+
	   |   0  |   1  | 0.33 |  0.2 |
	   +------+------+------+------+
*/
recip :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_float(E) {
	return T(1) / v
}