diff --git a/core/simd/x86/abm.odin b/core/simd/x86/abm.odin index f1898811f..5d7549ab3 100644 --- a/core/simd/x86/abm.odin +++ b/core/simd/x86/abm.odin @@ -3,17 +3,21 @@ package simd_x86 import "core:intrinsics" +@(require_results) _lzcnt_u32 :: #force_inline proc "c" (x: u32) -> u32 { return intrinsics.count_leading_zeros(x) } +@(require_results) _popcnt32 :: #force_inline proc "c" (x: u32) -> i32 { return i32(intrinsics.count_ones(x)) } when ODIN_ARCH == .amd64 { + @(require_results) _lzcnt_u64 :: #force_inline proc "c" (x: u64) -> u64 { return intrinsics.count_leading_zeros(x) } + @(require_results) _popcnt64 :: #force_inline proc "c" (x: u64) -> i32 { return i32(intrinsics.count_ones(x)) } diff --git a/core/simd/x86/adx.odin b/core/simd/x86/adx.odin index e73aa03a6..d03cffcff 100644 --- a/core/simd/x86/adx.odin +++ b/core/simd/x86/adx.odin @@ -1,14 +1,17 @@ //+build i386, amd64 package simd_x86 +@(require_results) _addcarry_u32 :: #force_inline proc "c" (c_in: u8, a: u32, b: u32, out: ^u32) -> u8 { x, y := llvm_addcarry_u32(c_in, a, b) out^ = y return x } +@(require_results) _addcarryx_u32 :: #force_inline proc "c" (c_in: u8, a: u32, b: u32, out: ^u32) -> u8 { return llvm_addcarryx_u32(c_in, a, b, out) } +@(require_results) _subborrow_u32 :: #force_inline proc "c" (c_in: u8, a: u32, b: u32, out: ^u32) -> u8 { x, y := llvm_subborrow_u32(c_in, a, b) out^ = y @@ -16,14 +19,17 @@ _subborrow_u32 :: #force_inline proc "c" (c_in: u8, a: u32, b: u32, out: ^u32) - } when ODIN_ARCH == .amd64 { + @(require_results) _addcarry_u64 :: #force_inline proc "c" (c_in: u8, a: u64, b: u64, out: ^u64) -> u8 { x, y := llvm_addcarry_u64(c_in, a, b) out^ = y return x } + @(require_results) _addcarryx_u64 :: #force_inline proc "c" (c_in: u8, a: u64, b: u64, out: ^u64) -> u8 { return llvm_addcarryx_u64(c_in, a, b, out) } + @(require_results) _subborrow_u64 :: #force_inline proc "c" (c_in: u8, a: u64, b: u64, out: ^u64) -> u8 { x, y := llvm_subborrow_u64(c_in, a, b) out^ = y diff --git 
a/core/simd/x86/pclmulqdq.odin b/core/simd/x86/pclmulqdq.odin index 8a665db03..692fb7ce1 100644 --- a/core/simd/x86/pclmulqdq.odin +++ b/core/simd/x86/pclmulqdq.odin @@ -1,7 +1,7 @@ //+build i386, amd64 package simd_x86 -@(enable_target_feature="pclmulqdq") +@(require_results, enable_target_feature="pclmulqdq") _mm_clmulepi64_si128 :: #force_inline proc "c" (a, b: __m128i, $IMM8: u8) -> __m128i { return pclmulqdq(a, b, u8(IMM8)) } diff --git a/core/simd/x86/rdtsc.odin b/core/simd/x86/rdtsc.odin index 91dcc4ec9..54024c3f2 100644 --- a/core/simd/x86/rdtsc.odin +++ b/core/simd/x86/rdtsc.odin @@ -1,10 +1,12 @@ //+build i386, amd64 package simd_x86 +@(require_results) _rdtsc :: #force_inline proc "c" () -> u64 { return rdtsc() } +@(require_results) __rdtscp :: #force_inline proc "c" (aux: ^u32) -> u64 { return rdtscp(aux) } diff --git a/core/simd/x86/sha.odin b/core/simd/x86/sha.odin index 90f1d72ce..f015f4b8a 100644 --- a/core/simd/x86/sha.odin +++ b/core/simd/x86/sha.odin @@ -1,31 +1,31 @@ //+build i386, amd64 package simd_x86 -@(enable_target_feature="sha") +@(require_results, enable_target_feature="sha") _mm_sha1msg1_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)sha1msg1(transmute(i32x4)a, transmute(i32x4)b) } -@(enable_target_feature="sha") +@(require_results, enable_target_feature="sha") _mm_sha1msg2_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)sha1msg2(transmute(i32x4)a, transmute(i32x4)b) } -@(enable_target_feature="sha") +@(require_results, enable_target_feature="sha") _mm_sha1nexte_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)sha1nexte(transmute(i32x4)a, transmute(i32x4)b) } -@(enable_target_feature="sha") +@(require_results, enable_target_feature="sha") _mm_sha1rnds4_epu32 :: #force_inline proc "c" (a, b: __m128i, $FUNC: u32) -> __m128i where 0 <= FUNC, FUNC <= 3 { return transmute(__m128i)sha1rnds4(transmute(i32x4)a, transmute(i32x4)b, 
u8(FUNC & 0xff)) } -@(enable_target_feature="sha") +@(require_results, enable_target_feature="sha") _mm_sha256msg1_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)sha256msg1(transmute(i32x4)a, transmute(i32x4)b) } -@(enable_target_feature="sha") +@(require_results, enable_target_feature="sha") _mm_sha256msg2_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)sha256msg2(transmute(i32x4)a, transmute(i32x4)b) } -@(enable_target_feature="sha") +@(require_results, enable_target_feature="sha") _mm_sha256rnds2_epu32 :: #force_inline proc "c" (a, b, k: __m128i) -> __m128i { return transmute(__m128i)sha256rnds2(transmute(i32x4)a, transmute(i32x4)b, transmute(i32x4)k) } diff --git a/core/simd/x86/sse.odin b/core/simd/x86/sse.odin index 6d8939b1b..3efdeccba 100644 --- a/core/simd/x86/sse.odin +++ b/core/simd/x86/sse.odin @@ -43,299 +43,299 @@ _MM_FLUSH_ZERO_ON :: 0x8000 _MM_FLUSH_ZERO_OFF :: 0x0000 -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_add_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { return addss(a, b) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_add_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { return simd.add(a, b) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_sub_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { return subss(a, b) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_sub_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { return simd.sub(a, b) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_mul_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { return mulss(a, b) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_mul_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { return simd.mul(a, b) } 
-@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_div_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { return divss(a, b) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_div_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { return simd.div(a, b) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_sqrt_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { return sqrtss(a) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_sqrt_ps :: #force_inline proc "c" (a: __m128) -> __m128 { return sqrtps(a) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_rcp_ss :: #force_inline proc "c" (a: __m128) -> __m128 { return rcpss(a) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_rcp_ps :: #force_inline proc "c" (a: __m128) -> __m128 { return rcpps(a) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_rsqrt_ss :: #force_inline proc "c" (a: __m128) -> __m128 { return rsqrtss(a) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_rsqrt_ps :: #force_inline proc "c" (a: __m128) -> __m128 { return rsqrtps(a) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_min_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { return minss(a, b) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_min_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { return minps(a, b) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_max_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { return maxss(a, b) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_max_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { return maxps(a, b) } 
-@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_and_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { return transmute(__m128)simd.and(transmute(__m128i)a, transmute(__m128i)b) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_andnot_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { return transmute(__m128)simd.and_not(transmute(__m128i)a, transmute(__m128i)b) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_or_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { return transmute(__m128)simd.or(transmute(__m128i)a, transmute(__m128i)b) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_xor_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { return transmute(__m128)simd.xor(transmute(__m128i)a, transmute(__m128i)b) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_cmpeq_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { return cmpss(a, b, 0) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_cmplt_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { return cmpss(a, b, 1) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_cmple_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { return cmpss(a, b, 2) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_cmpgt_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { return simd.shuffle(a, cmpss(b, a, 1), 4, 1, 2, 3) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_cmpge_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { return simd.shuffle(a, cmpss(b, a, 2), 4, 1, 2, 3) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_cmpneq_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { return cmpss(a, b, 4) } 
-@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_cmpnlt_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { return cmpss(a, b, 5) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_cmpnle_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { return cmpss(a, b, 6) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_cmpngt_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { return simd.shuffle(a, cmpss(b, a, 5), 4, 1, 2, 3) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_cmpnge_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { return simd.shuffle(a, cmpss(b, a, 6), 4, 1, 2, 3) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_cmpord_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { return cmpss(a, b, 7) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_cmpunord_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { return cmpss(a, b, 3) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_cmpeq_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { return cmpps(a, b, 0) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_cmplt_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { return cmpps(a, b, 1) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_cmple_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { return cmpps(a, b, 2) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_cmpgt_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { return cmpps(b, a, 1) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_cmpge_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { return cmpps(b, a, 2) } -@(enable_target_feature="sse") +@(require_results, 
enable_target_feature="sse") _mm_cmpneq_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { return cmpps(a, b, 4) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_cmpnlt_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { return cmpps(a, b, 5) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_cmpnle_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { return cmpps(a, b, 6) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_cmpngt_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { return cmpps(b, a, 5) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_cmpnge_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { return cmpps(b, a, 6) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_cmpord_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { return cmpps(b, a, 7) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_cmpunord_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { return cmpps(b, a, 3) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_comieq_ss :: #force_inline proc "c" (a, b: __m128) -> b32 { return comieq_ss(a, b) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_comilt_ss :: #force_inline proc "c" (a, b: __m128) -> b32 { return comilt_ss(a, b) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_comile_ss :: #force_inline proc "c" (a, b: __m128) -> b32 { return comile_ss(a, b) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_comigt_ss :: #force_inline proc "c" (a, b: __m128) -> b32 { return comigt_ss(a, b) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_comige_ss :: #force_inline proc "c" (a, b: __m128) -> b32 { return comige_ss(a, 
b) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_comineq_ss :: #force_inline proc "c" (a, b: __m128) -> b32 { return comineq_ss(a, b) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_ucomieq_ss :: #force_inline proc "c" (a, b: __m128) -> b32 { return ucomieq_ss(a, b) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_ucomilt_ss :: #force_inline proc "c" (a, b: __m128) -> b32 { return ucomilt_ss(a, b) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_ucomile_ss :: #force_inline proc "c" (a, b: __m128) -> b32 { return ucomile_ss(a, b) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_ucomigt_ss :: #force_inline proc "c" (a, b: __m128) -> b32 { return ucomigt_ss(a, b) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_ucomige_ss :: #force_inline proc "c" (a, b: __m128) -> b32 { return ucomige_ss(a, b) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_ucomineq_ss :: #force_inline proc "c" (a, b: __m128) -> b32 { return ucomineq_ss(a, b) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_cvtss_si32 :: #force_inline proc "c" (a: __m128) -> i32 { return cvtss2si(a) } _mm_cvt_ss2si :: _mm_cvtss_si32 _mm_cvttss_si32 :: _mm_cvtss_si32 -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_cvtss_f32 :: #force_inline proc "c" (a: __m128) -> f32 { return simd.extract(a, 0) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_cvtsi32_ss :: #force_inline proc "c" (a: __m128, b: i32) -> __m128 { return cvtsi2ss(a, b) } _mm_cvt_si2ss :: _mm_cvtsi32_ss -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_set_ss :: #force_inline proc "c" (a: f32) -> __m128 { return __m128{a, 0, 0, 0} } 
-@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_set1_ps :: #force_inline proc "c" (a: f32) -> __m128 { return __m128(a) } _mm_set_ps1 :: _mm_set1_ps -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_set_ps :: #force_inline proc "c" (a, b, c, d: f32) -> __m128 { return __m128{d, c, b, a} } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_setr_ps :: #force_inline proc "c" (a, b, c, d: f32) -> __m128 { return __m128{a, b, c, d} } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_setzero_ps :: #force_inline proc "c" () -> __m128 { return __m128{0, 0, 0, 0} } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_shuffle_ps :: #force_inline proc "c" (a, b: __m128, $MASK: u32) -> __m128 { return simd.shuffle( a, b, @@ -346,58 +346,58 @@ _mm_shuffle_ps :: #force_inline proc "c" (a, b: __m128, $MASK: u32) -> __m128 { } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_unpackhi_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { return simd.shuffle(a, b, 2, 6, 3, 7) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_unpacklo_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { return simd.shuffle(a, b, 0, 4, 1, 5) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_movehl_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { return simd.shuffle(a, b, 6, 7, 2, 3) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_movelh_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { return simd.shuffle(a, b, 0, 1, 4, 5) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_movemask_ps :: #force_inline proc "c" (a: __m128) -> u32 { return movmskps(a) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") 
_mm_load_ss :: #force_inline proc "c" (p: ^f32) -> __m128 { return __m128{p^, 0, 0, 0} } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_load1_ps :: #force_inline proc "c" (p: ^f32) -> __m128 { a := p^ return __m128(a) } _mm_load_ps1 :: _mm_load1_ps -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_load_ps :: #force_inline proc "c" (p: [^]f32) -> __m128 { return (^__m128)(p)^ } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_loadu_ps :: #force_inline proc "c" (p: [^]f32) -> __m128 { dst := _mm_undefined_ps() intrinsics.mem_copy_non_overlapping(&dst, p, size_of(__m128)) return dst } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_loadr_ps :: #force_inline proc "c" (p: [^]f32) -> __m128 { return simd.lanes_reverse(_mm_load_ps(p)) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_loadu_si64 :: #force_inline proc "c" (mem_addr: rawptr) -> __m128i { a := intrinsics.unaligned_load((^i64)(mem_addr)) return __m128i{a, 0} @@ -431,7 +431,7 @@ _mm_storer_ps :: #force_inline proc "c" (p: [^]f32, a: __m128) { } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_move_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { return simd.shuffle(a, b, 4, 1, 2, 3) } @@ -441,7 +441,7 @@ _mm_sfence :: #force_inline proc "c" () { sfence() } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_getcsr :: #force_inline proc "c" () -> (result: u32) { stmxcsr(&result) return result @@ -453,19 +453,19 @@ _mm_setcsr :: #force_inline proc "c" (val: u32) { ldmxcsr(&val) } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _MM_GET_EXCEPTION_MASK :: #force_inline proc "c" () -> u32 { return _mm_getcsr() & _MM_MASK_MASK } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _MM_GET_EXCEPTION_STATE 
:: #force_inline proc "c" () -> u32 { return _mm_getcsr() & _MM_EXCEPT_MASK } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _MM_GET_FLUSH_ZERO_MODE :: #force_inline proc "c" () -> u32 { return _mm_getcsr() & _MM_FLUSH_ZERO_MASK } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _MM_GET_ROUNDING_MODE :: #force_inline proc "c" () -> u32 { return _mm_getcsr() & _MM_ROUND_MASK } @@ -493,7 +493,7 @@ _mm_prefetch :: #force_inline proc "c" (p: rawptr, $STRATEGY: u32) { } -@(enable_target_feature="sse") +@(require_results, enable_target_feature="sse") _mm_undefined_ps :: #force_inline proc "c" () -> __m128 { return _mm_set1_ps(0) } @@ -517,15 +517,15 @@ _mm_stream_ps :: #force_inline proc "c" (addr: [^]f32, a: __m128) { } when ODIN_ARCH == .amd64 { - @(enable_target_feature="sse") + @(require_results, enable_target_feature="sse") _mm_cvtss_si64 :: #force_inline proc "c"(a: __m128) -> i64 { return cvtss2si64(a) } - @(enable_target_feature="sse") + @(require_results, enable_target_feature="sse") _mm_cvttss_si64 :: #force_inline proc "c"(a: __m128) -> i64 { return cvttss2si64(a) } - @(enable_target_feature="sse") + @(require_results, enable_target_feature="sse") _mm_cvtsi64_ss :: #force_inline proc "c"(a: __m128, b: i64) -> __m128 { return cvtsi642ss(a, b) } diff --git a/core/simd/x86/sse2.odin b/core/simd/x86/sse2.odin index d15df8120..f33bd2195 100644 --- a/core/simd/x86/sse2.odin +++ b/core/simd/x86/sse2.odin @@ -21,118 +21,118 @@ _mm_mfence :: #force_inline proc "c" () { mfence() } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_add_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.add(transmute(i8x16)a, transmute(i8x16)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_add_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.add(transmute(i16x8)a, 
transmute(i16x8)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_add_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.add(transmute(i32x4)a, transmute(i32x4)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_add_epi64 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.add(transmute(i64x2)a, transmute(i64x2)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_adds_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.add_sat(transmute(i8x16)a, transmute(i8x16)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_adds_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.add_sat(transmute(i16x8)a, transmute(i16x8)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_adds_epu8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.add_sat(transmute(u8x16)a, transmute(u8x16)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_adds_epu16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.add_sat(transmute(u16x8)a, transmute(u16x8)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_avg_epu8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)pavgb(transmute(u8x16)a, transmute(u8x16)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_avg_epu16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)pavgw(transmute(u16x8)a, transmute(u16x8)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_madd_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return 
transmute(__m128i)pmaddwd(transmute(i16x8)a, transmute(i16x8)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_max_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)pmaxsw(transmute(i16x8)a, transmute(i16x8)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_max_epu8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)pmaxub(transmute(u8x16)a, transmute(u8x16)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_min_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)pminsw(transmute(i16x8)a, transmute(i16x8)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_min_epu8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)pminub(transmute(u8x16)a, transmute(u8x16)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_mulhi_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)pmulhw(transmute(i16x8)a, transmute(i16x8)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_mulhi_epu16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)pmulhuw(transmute(u16x8)a, transmute(u16x8)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_mullo_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.mul(transmute(i16x8)a, transmute(i16x8)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_mul_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)pmuludq(transmute(u32x4)a, transmute(u32x4)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_sad_epu8 :: #force_inline proc "c" (a, b: 
__m128i) -> __m128i { return transmute(__m128i)psadbw(transmute(u8x16)a, transmute(u8x16)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_sub_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.sub(transmute(i8x16)a, transmute(i8x16)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_sub_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.sub(transmute(i16x8)a, transmute(i16x8)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_sub_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.sub(transmute(i32x4)a, transmute(i32x4)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_sub_epi64 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.sub(transmute(i64x2)a, transmute(i64x2)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_subs_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.sub_sat(transmute(i8x16)a, transmute(i8x16)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_subs_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.sub_sat(transmute(i16x8)a, transmute(i16x8)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_subs_epu8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.sub_sat(transmute(u8x16)a, transmute(u8x16)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_subs_epu16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.sub_sat(transmute(u16x8)a, transmute(u16x8)b) } @@ -140,7 +140,7 @@ _mm_subs_epu16 :: #force_inline proc "c" (a, b: __m128i) -> 
__m128i { @(private) -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_slli_si128_impl :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { shift :: IMM8 & 0xff @@ -167,7 +167,7 @@ _mm_slli_si128_impl :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128 } @(private) -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_srli_si128_impl :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { shift :: IMM8 return transmute(__m128i)simd.shuffle( @@ -193,233 +193,233 @@ _mm_srli_si128_impl :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128 } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_slli_si128 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { return _mm_slli_si128_impl(a, IMM8) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_bslli_si128 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { return _mm_slli_si128_impl(a, IMM8) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_bsrli_si128 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { return _mm_srli_si128_impl(a, IMM8) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_slli_epi16 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { return transmute(__m128i)pslliw(transmute(i16x8)a, IMM8) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_sll_epi16 :: #force_inline proc "c" (a, count: __m128i) -> __m128i { return transmute(__m128i)psllw(transmute(i16x8)a, transmute(i16x8)count) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_slli_epi32 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { return transmute(__m128i)psllid(transmute(i32x4)a, IMM8) } -@(enable_target_feature="sse2") +@(require_results, 
enable_target_feature="sse2") _mm_sll_epi32 :: #force_inline proc "c" (a, count: __m128i) -> __m128i { return transmute(__m128i)pslld(transmute(i32x4)a, transmute(i32x4)count) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_slli_epi64 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { return transmute(__m128i)pslliq(transmute(i64x2)a, IMM8) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_sll_epi64 :: #force_inline proc "c" (a, count: __m128i) -> __m128i { return transmute(__m128i)psllq(transmute(i64x2)a, transmute(i64x2)count) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_srai_epi16 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { return transmute(__m128i)psraiw(transmute(i16x8)a, IMM8) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_sra_epi16 :: #force_inline proc "c" (a, count: __m128i) -> __m128i { return transmute(__m128i)psraw(transmute(i16x8)a, transmute(i16x8)count) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_srai_epi32 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { return transmute(__m128i)psraid(transmute(i32x4)a, IMM8) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_sra_epi32 :: #force_inline proc "c" (a, count: __m128i) -> __m128i { return transmute(__m128i)psrad(transmute(i32x4)a, transmute(i32x4)count) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_srli_si128 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { return _mm_srli_si128_impl(a, IMM8) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_srli_epi16 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { return transmute(__m128i)psrliw(transmute(i16x8)a, 
IMM8) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_srl_epi16 :: #force_inline proc "c" (a, count: __m128i) -> __m128i { return transmute(__m128i)psrlw(transmute(i16x8)a, transmute(i16x8)count) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_srli_epi32 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { return transmute(__m128i)psrlid(transmute(i32x4)a, IMM8) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_srl_epi32 :: #force_inline proc "c" (a, count: __m128i) -> __m128i { return transmute(__m128i)psrld(transmute(i32x4)a, transmute(i32x4)count) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_srli_epi64 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { return transmute(__m128i)psrliq(transmute(i64x2)a, IMM8) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_srl_epi64 :: #force_inline proc "c" (a, count: __m128i) -> __m128i { return transmute(__m128i)psrlq(transmute(i64x2)a, transmute(i64x2)count) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_and_si128 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return simd.and(a, b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_andnot_si128 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return simd.and_not(b, a) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_or_si128 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return simd.or(a, b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_xor_si128 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return simd.xor(a, b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmpeq_epi8 :: #force_inline proc "c" (a, b: __m128i) -> 
__m128i { return transmute(__m128i)simd.lanes_eq(transmute(i8x16)a, transmute(i8x16)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmpeq_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.lanes_eq(transmute(i16x8)a, transmute(i16x8)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmpeq_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.lanes_eq(transmute(i32x4)a, transmute(i32x4)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmpgt_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.lanes_gt(transmute(i8x16)a, transmute(i8x16)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmpgt_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.lanes_gt(transmute(i16x8)a, transmute(i16x8)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmpgt_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.lanes_gt(transmute(i32x4)a, transmute(i32x4)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmplt_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.lanes_lt(transmute(i8x16)a, transmute(i8x16)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmplt_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.lanes_lt(transmute(i16x8)a, transmute(i16x8)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmplt_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.lanes_lt(transmute(i32x4)a, transmute(i32x4)b) } -@(enable_target_feature="sse2") 
+@(require_results, enable_target_feature="sse2") _mm_cvtepi32_pd :: #force_inline proc "c" (a: __m128i) -> __m128d { v := transmute(i32x4)a return cast(__m128d)simd.shuffle(v, v, 0, 1) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cvtsi32_sd :: #force_inline proc "c" (a: __m128d, b: i32) -> __m128d { return simd.replace(a, 0, f64(b)) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cvtepi32_ps :: #force_inline proc "c" (a: __m128i) -> __m128 { return cvtdq2ps(transmute(i32x4)a) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cvtps_epi32 :: #force_inline proc "c" (a: __m128) -> __m128i { return transmute(__m128i)cvtps2dq(a) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cvtsi32_si128 :: #force_inline proc "c" (a: i32) -> __m128i { return transmute(__m128i)i32x4{a, 0, 0, 0} } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cvtsi128_si32 :: #force_inline proc "c" (a: __m128i) -> i32 { return simd.extract(transmute(i32x4)a, 0) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_set_epi64x :: #force_inline proc "c" (e1, e0: i64) -> __m128i { return transmute(__m128i)i64x2{e0, e1} } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_set_epi32 :: #force_inline proc "c" (e3, e2, e1, e0: i32) -> __m128i { return transmute(__m128i)i32x4{e0, e1, e2, e3} } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_set_epi16 :: #force_inline proc "c" (e7, e6, e5, e4, e3, e2, e1, e0: i16) -> __m128i { return transmute(__m128i)i16x8{e0, e1, e2, e3, e4, e5, e6, e7} } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_set_epi8 :: #force_inline proc "c" (e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0: i8) -> __m128i { return 
transmute(__m128i)i8x16{e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15} } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_set1_epi64x :: #force_inline proc "c" (a: i64) -> __m128i { return _mm_set_epi64x(a, a) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_set1_epi32 :: #force_inline proc "c" (a: i32) -> __m128i { return _mm_set_epi32(a, a, a, a) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_set1_epi16 :: #force_inline proc "c" (a: i16) -> __m128i { return _mm_set_epi16(a, a, a, a, a, a, a, a) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_set1_epi8 :: #force_inline proc "c" (a: i8) -> __m128i { return _mm_set_epi8(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_setr_epi32 :: #force_inline proc "c" (e3, e2, e1, e0: i32) -> __m128i { return _mm_set_epi32(e0, e1, e2, e3) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_setr_epi16 :: #force_inline proc "c" (e7, e6, e5, e4, e3, e2, e1, e0: i16) -> __m128i { return _mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_setr_epi8 :: #force_inline proc "c" (e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0: i8) -> __m128i { return _mm_set_epi8(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_setzero_si128 :: #force_inline proc "c" () -> __m128i { return _mm_set1_epi64x(0) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_loadl_epi64 :: #force_inline proc "c" (mem_addr: ^__m128i) -> __m128i { return _mm_set_epi64x(0, intrinsics.unaligned_load((^i64)(mem_addr))) } 
-@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_load_si128 :: #force_inline proc "c" (mem_addr: ^__m128i) -> __m128i { return mem_addr^ } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_loadu_si128 :: #force_inline proc "c" (mem_addr: ^__m128i) -> __m128i { dst := _mm_undefined_si128() intrinsics.mem_copy_non_overlapping(&dst, mem_addr, size_of(__m128i)) @@ -450,7 +450,7 @@ _mm_stream_si128 :: #force_inline proc "c" (mem_addr: ^__m128i, a: __m128i) { _mm_stream_si32 :: #force_inline proc "c" (mem_addr: ^i32, a: i32) { intrinsics.non_temporal_store(mem_addr, a) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_move_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i { zero := _mm_setzero_si128() return transmute(__m128i)simd.shuffle(transmute(i64x2)a, transmute(i64x2)zero, 0, 2) @@ -459,31 +459,31 @@ _mm_move_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i { -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_packs_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)packsswb(transmute(i16x8)a, transmute(i16x8)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_packs_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)packssdw(transmute(i32x4)a, transmute(i32x4)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_packus_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)packuswb(transmute(i16x8)a, transmute(i16x8)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_extract_epi16 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> i32 { return i32(simd.extract(transmute(u16x8)a, IMM8)) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_insert_epi16 :: 
#force_inline proc "c" (a: __m128i, i: i32, $IMM8: u32) -> __m128i { return transmute(__m128i)simd.replace(transmute(u16x8)a, IMM8, u16(i)) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_movemask_epi8 :: #force_inline proc "c" (a: __m128i) -> i32 { return pmovmskb(transmute(i8x16)a) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_shuffle_epi32 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { v := transmute(i32x4)a return transmute(__m128i)simd.shuffle( @@ -495,7 +495,7 @@ _mm_shuffle_epi32 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i (IMM8 >> 6) & 0b11, ) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_shufflehi_epi16 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { v := transmute(i16x8)a return transmute(__m128i)simd.shuffle( @@ -511,7 +511,7 @@ _mm_shufflehi_epi16 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128 ((IMM8 >> 6) & 0b11) + 4, ) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_shufflelo_epi16 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { v := transmute(i16x8)a return transmute(__m128i)simd.shuffle( @@ -527,7 +527,7 @@ _mm_shufflelo_epi16 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128 7, ) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_unpackhi_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.shuffle( transmute(i8x16)a, @@ -535,19 +535,19 @@ _mm_unpackhi_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31, ) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_unpackhi_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.shuffle(transmute(i16x8)a, transmute(i16x8)b, 4, 12, 5, 13, 6, 14, 7, 15)
} -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_unpackhi_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.shuffle(transmute(i32x4)a, transmute(i32x4)b, 2, 6, 3, 7) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_unpackhi_epi64 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.shuffle(transmute(i64x2)a, transmute(i64x2)b, 1, 3) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_unpacklo_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.shuffle( transmute(i8x16)a, @@ -555,15 +555,15 @@ _mm_unpacklo_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, ) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_unpacklo_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.shuffle(transmute(i16x8)a, transmute(i16x8)b, 0, 8, 1, 9, 2, 10, 3, 11) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_unpacklo_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.shuffle(transmute(i32x4)a, transmute(i32x4)b, 0, 4, 1, 5) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_unpacklo_epi64 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.shuffle(transmute(i64x2)a, transmute(i64x2)b, 0, 2) } @@ -571,75 +571,75 @@ _mm_unpacklo_epi64 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_add_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return simd.replace(a, 0, _mm_cvtsd_f64(a) + _mm_cvtsd_f64(b)) } -@(enable_target_feature="sse2") +@(require_results, 
enable_target_feature="sse2") _mm_add_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return simd.add(a, b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_div_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return simd.replace(a, 0, _mm_cvtsd_f64(a) / _mm_cvtsd_f64(b)) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_div_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return simd.div(a, b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_max_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return maxsd(a, b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_max_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return maxpd(a, b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_min_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return minsd(a, b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_min_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return minpd(a, b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_mul_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return simd.replace(a, 0, _mm_cvtsd_f64(a) * _mm_cvtsd_f64(b)) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_mul_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return simd.mul(a, b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_sqrt_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return simd.replace(a, 0, _mm_cvtsd_f64(sqrtsd(b))) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_sqrt_pd :: #force_inline proc "c" (a: __m128d) -> __m128d { return simd.sqrt(a) } -@(enable_target_feature="sse2") +@(require_results, 
enable_target_feature="sse2") _mm_sub_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return simd.replace(a, 0, _mm_cvtsd_f64(a) - _mm_cvtsd_f64(b)) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_sub_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return simd.sub(a, b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_and_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return transmute(__m128d)_mm_and_si128(transmute(__m128i)a, transmute(__m128i)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_andnot_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return transmute(__m128d)_mm_andnot_si128(transmute(__m128i)a, transmute(__m128i)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_or_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return transmute(__m128d)_mm_or_si128(transmute(__m128i)a, transmute(__m128i)b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_xor_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return transmute(__m128d)_mm_xor_si128(transmute(__m128i)a, transmute(__m128i)b) } @@ -647,147 +647,147 @@ _mm_xor_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmpeq_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return cmpsd(a, b, 0) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmplt_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return cmpsd(a, b, 1) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmple_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return cmpsd(a, b, 2) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmpgt_sd :: #force_inline proc 
"c" (a, b: __m128d) -> __m128d { return simd.replace(_mm_cmplt_sd(b, a), 1, simd.extract(a, 1)) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmpge_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return simd.replace(_mm_cmple_sd(b, a), 1, simd.extract(a, 1)) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmpord_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return cmpsd(a, b, 7) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmpunord_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return cmpsd(a, b, 3) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmpneq_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return cmpsd(a, b, 4) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmpnlt_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return cmpsd(a, b, 5) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmpnle_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return cmpsd(a, b, 6) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmpngt_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return simd.replace(_mm_cmpnlt_sd(b, a), 1, simd.extract(a, 1)) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmpnge_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return simd.replace(_mm_cmpnle_sd(b, a), 1, simd.extract(a, 1)) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmpeq_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return cmppd(a, b, 0) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmplt_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return cmppd(a, b, 1) } 
-@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmple_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return cmppd(a, b, 2) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmpgt_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return _mm_cmplt_pd(b, a) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmpge_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return _mm_cmple_pd(b, a) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmpord_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return cmppd(a, b, 7) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmpunord_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return cmppd(a, b, 3) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmpneq_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return cmppd(a, b, 4) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmpnlt_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return cmppd(a, b, 5) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmpnle_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return cmppd(a, b, 6) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmpngt_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return _mm_cmpnlt_pd(b, a) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cmpnge_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return _mm_cmpnle_pd(b, a) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_comieq_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 { return comieqsd(a, b) } -@(enable_target_feature="sse2") 
+@(require_results, enable_target_feature="sse2") _mm_comilt_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 { return comiltsd(a, b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_comile_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 { return comilesd(a, b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_comigt_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 { return comigtsd(a, b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_comige_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 { return comigesd(a, b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_comineq_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 { return comineqsd(a, b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_ucomieq_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 { return ucomieqsd(a, b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_ucomilt_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 { return ucomiltsd(a, b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_ucomile_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 { return ucomilesd(a, b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_ucomigt_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 { return ucomigtsd(a, b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_ucomige_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 { return ucomigesd(a, b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_ucomineq_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 { return ucomineqsd(a, b) } @@ -796,87 +796,87 @@ _mm_ucomineq_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 { -@(enable_target_feature="sse2") 
+@(require_results, enable_target_feature="sse2") _mm_cvtpd_ps :: #force_inline proc "c" (a: __m128d) -> __m128 { return cvtpd2ps(a) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cvtps_pd :: #force_inline proc "c" (a: __m128) -> __m128d { return cvtps2pd(a) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cvtpd_epi32 :: #force_inline proc "c" (a: __m128d) -> __m128i { return transmute(__m128i)cvtpd2dq(a) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cvtsd_si32 :: #force_inline proc "c" (a: __m128d) -> i32 { return cvtsd2si(a) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cvtsd_ss :: #force_inline proc "c" (a, b: __m128d) -> __m128 { return cvtsd2ss(a, b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cvtsd_f64 :: #force_inline proc "c" (a: __m128d) -> f64 { return simd.extract(a, 0) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cvtss_sd :: #force_inline proc "c" (a, b: __m128) -> __m128d { return cvtss2sd(a, b) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cvttpd_epi32 :: #force_inline proc "c" (a: __m128d) -> __m128i { return transmute(__m128i)cvttpd2dq(a) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cvttsd_si32 :: #force_inline proc "c" (a: __m128d) -> i32 { return cvttsd2si(a) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_cvttps_epi32 :: #force_inline proc "c" (a: __m128) -> __m128i { return transmute(__m128i)cvttps2dq(a) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_set_sd :: #force_inline proc "c" (a: f64) -> __m128d { return _mm_set_pd(0.0, a) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_set1_pd 
:: #force_inline proc "c" (a: f64) -> __m128d { return _mm_set_pd(a, a) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_set_pd1 :: #force_inline proc "c" (a: f64) -> __m128d { return _mm_set_pd(a, a) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_set_pd :: #force_inline proc "c" (a: f64, b: f64) -> __m128d { return __m128d{b, a} } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_setr_pd :: #force_inline proc "c" (a: f64, b: f64) -> __m128d { return _mm_set_pd(b, a) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_setzero_pd :: #force_inline proc "c" () -> __m128d { return _mm_set_pd(0.0, 0.0) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_movemask_pd :: #force_inline proc "c" (a: __m128d) -> i32 { return movmskpd(a) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_load_pd :: #force_inline proc "c" (mem_addr: ^f64) -> __m128d { return (^__m128d)(mem_addr)^ } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_load_sd :: #force_inline proc "c" (mem_addr: ^f64) -> __m128d { return _mm_setr_pd(mem_addr^, 0.) 
} -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_loadh_pd :: #force_inline proc "c" (a: __m128d, mem_addr: ^f64) -> __m128d { return _mm_setr_pd(simd.extract(a, 0), mem_addr^) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_loadl_pd :: #force_inline proc "c" (a: __m128d, mem_addr: ^f64) -> __m128d { return _mm_setr_pd(mem_addr^, simd.extract(a, 1)) } @@ -916,31 +916,31 @@ _mm_storeh_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m128d) { _mm_storel_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m128d) { mem_addr^ = simd.extract(a, 0) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_load1_pd :: #force_inline proc "c" (mem_addr: ^f64) -> __m128d { d := mem_addr^ return _mm_setr_pd(d, d) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_load_pd1 :: #force_inline proc "c" (mem_addr: ^f64) -> __m128d { return _mm_load1_pd(mem_addr) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_loadr_pd :: #force_inline proc "c" (mem_addr: ^f64) -> __m128d { a := _mm_load_pd(mem_addr) return simd.shuffle(a, a, 1, 0) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_loadu_pd :: #force_inline proc "c" (mem_addr: ^f64) -> __m128d { dst := _mm_undefined_pd() intrinsics.mem_copy_non_overlapping(&dst, mem_addr, size_of(__m128d)) return dst } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_shuffle_pd :: #force_inline proc "c" (a, b: __m128d, $MASK: u32) -> __m128d { return simd.shuffle(a, b, MASK&0b1, ((MASK>>1)&0b1) + 2) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_move_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return _mm_setr_pd(simd.extract(b, 0), simd.extract(a, 1)) } @@ -948,64 +948,64 @@ _mm_move_sd :: #force_inline proc "c" (a, b: __m128d) -> 
__m128d { -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_castpd_ps :: #force_inline proc "c" (a: __m128d) -> __m128 { return transmute(__m128)a } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_castpd_si128 :: #force_inline proc "c" (a: __m128d) -> __m128i { return transmute(__m128i)a } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_castps_pd :: #force_inline proc "c" (a: __m128) -> __m128d { return transmute(__m128d)a } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_castps_si128 :: #force_inline proc "c" (a: __m128) -> __m128i { return transmute(__m128i)a } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_castsi128_pd :: #force_inline proc "c" (a: __m128i) -> __m128d { return transmute(__m128d)a } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_castsi128_ps :: #force_inline proc "c" (a: __m128i) -> __m128 { return transmute(__m128)a } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_undefined_pd :: #force_inline proc "c" () -> __m128d { return __m128d{0, 0} } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_undefined_si128 :: #force_inline proc "c" () -> __m128i { return __m128i{0, 0} } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_unpackhi_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return simd.shuffle(a, b, 1, 3) } -@(enable_target_feature="sse2") +@(require_results, enable_target_feature="sse2") _mm_unpacklo_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return simd.shuffle(a, b, 0, 2) } when ODIN_ARCH == .amd64 { - @(enable_target_feature="sse2") + @(require_results, enable_target_feature="sse2") _mm_cvtsd_si64 :: #force_inline proc "c" (a: __m128d) -> i64 { return cvtsd2si64(a) } - 
@(enable_target_feature="sse2") + @(require_results, enable_target_feature="sse2") _mm_cvtsd_si64x :: #force_inline proc "c" (a: __m128d) -> i64 { return _mm_cvtsd_si64(a) } - @(enable_target_feature="sse2") + @(require_results, enable_target_feature="sse2") _mm_cvttsd_si64 :: #force_inline proc "c" (a: __m128d) -> i64 { return cvttsd2si64(a) } - @(enable_target_feature="sse2") + @(require_results, enable_target_feature="sse2") _mm_cvttsd_si64x :: #force_inline proc "c" (a: __m128d) -> i64 { return _mm_cvttsd_si64(a) } @@ -1013,27 +1013,27 @@ when ODIN_ARCH == .amd64 { _mm_stream_si64 :: #force_inline proc "c" (mem_addr: ^i64, a: i64) { intrinsics.non_temporal_store(mem_addr, a) } - @(enable_target_feature="sse2") + @(require_results, enable_target_feature="sse2") _mm_cvtsi64_si128 :: #force_inline proc "c" (a: i64) -> __m128i { return _mm_set_epi64x(0, a) } - @(enable_target_feature="sse2") + @(require_results, enable_target_feature="sse2") _mm_cvtsi64x_si128 :: #force_inline proc "c" (a: i64) -> __m128i { return _mm_cvtsi64_si128(a) } - @(enable_target_feature="sse2") + @(require_results, enable_target_feature="sse2") _mm_cvtsi128_si64 :: #force_inline proc "c" (a: __m128i) -> i64 { return simd.extract(transmute(i64x2)a, 0) } - @(enable_target_feature="sse2") + @(require_results, enable_target_feature="sse2") _mm_cvtsi128_si64x :: #force_inline proc "c" (a: __m128i) -> i64 { return _mm_cvtsi128_si64(a) } - @(enable_target_feature="sse2") + @(require_results, enable_target_feature="sse2") _mm_cvtsi64_sd :: #force_inline proc "c" (a: __m128d, b: i64) -> __m128d { return simd.replace(a, 0, f64(b)) } - @(enable_target_feature="sse2") + @(require_results, enable_target_feature="sse2") _mm_cvtsi64x_sd :: #force_inline proc "c" (a: __m128d, b: i64) -> __m128d { return _mm_cvtsi64_sd(a, b) } diff --git a/core/simd/x86/sse3.odin b/core/simd/x86/sse3.odin index 370bfa952..7a3073c18 100644 --- a/core/simd/x86/sse3.odin +++ b/core/simd/x86/sse3.odin @@ -4,47 +4,47 @@ package 
simd_x86 import "core:intrinsics" import "core:simd" -@(enable_target_feature="sse3") +@(require_results, enable_target_feature="sse3") _mm_addsub_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { return addsubps(a, b) } -@(enable_target_feature="sse3") +@(require_results, enable_target_feature="sse3") _mm_addsub_pd :: #force_inline proc "c" (a: __m128d, b: __m128d) -> __m128d { return addsubpd(a, b) } -@(enable_target_feature="sse3") +@(require_results, enable_target_feature="sse3") _mm_hadd_pd :: #force_inline proc "c" (a: __m128d, b: __m128d) -> __m128d { return haddpd(a, b) } -@(enable_target_feature="sse3") +@(require_results, enable_target_feature="sse3") _mm_hadd_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { return haddps(a, b) } -@(enable_target_feature="sse3") +@(require_results, enable_target_feature="sse3") _mm_hsub_pd :: #force_inline proc "c" (a: __m128d, b: __m128d) -> __m128d { return hsubpd(a, b) } -@(enable_target_feature="sse3") +@(require_results, enable_target_feature="sse3") _mm_hsub_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { return hsubps(a, b) } -@(enable_target_feature="sse3") +@(require_results, enable_target_feature="sse3") _mm_lddqu_si128 :: #force_inline proc "c" (mem_addr: ^__m128i) -> __m128i { return transmute(__m128i)lddqu(mem_addr) } -@(enable_target_feature="sse3") +@(require_results, enable_target_feature="sse3") _mm_movedup_pd :: #force_inline proc "c" (a: __m128d) -> __m128d { return simd.shuffle(a, a, 0, 0) } -@(enable_target_feature="sse3") +@(require_results, enable_target_feature="sse3") _mm_loaddup_pd :: #force_inline proc "c" (mem_addr: [^]f64) -> __m128d { return _mm_load1_pd(mem_addr) } -@(enable_target_feature="sse3") +@(require_results, enable_target_feature="sse3") _mm_movehdup_ps :: #force_inline proc "c" (a: __m128) -> __m128 { return simd.shuffle(a, a, 1, 1, 3, 3) } -@(enable_target_feature="sse3") +@(require_results, enable_target_feature="sse3") _mm_moveldup_ps :: #force_inline proc 
"c" (a: __m128) -> __m128 { return simd.shuffle(a, a, 0, 0, 2, 2) } diff --git a/core/simd/x86/sse41.odin b/core/simd/x86/sse41.odin index 516e0bdc2..b35be33f2 100644 --- a/core/simd/x86/sse41.odin +++ b/core/simd/x86/sse41.odin @@ -20,271 +20,271 @@ _MM_FROUND_NEARBYINT :: _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_blendv_epi8 :: #force_inline proc "c" (a, b, mask: __m128i) -> __m128i { return transmute(__m128i)pblendvb(transmute(i8x16)a, transmute(i8x16)b, transmute(i8x16)mask) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_blend_epi16 :: #force_inline proc "c" (a, b: __m128i, $IMM8: u8) -> __m128i { return transmute(__m128i)pblendw(transmute(i16x8)a, transmute(i16x8)b, IMM8) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_blendv_pd :: #force_inline proc "c" (a, b, mask: __m128d) -> __m128d { return blendvpd(a, b, mask) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_blendv_ps :: #force_inline proc "c" (a, b, mask: __m128) -> __m128 { return blendvps(a, b, mask) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_blend_pd :: #force_inline proc "c" (a, b: __m128d, $IMM2: u8) -> __m128d { return blendpd(a, b, IMM2) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_blend_ps :: #force_inline proc "c" (a, b: __m128, $IMM4: u8) -> __m128 { return blendps(a, b, IMM4) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_extract_ps :: #force_inline proc "c" (a: __m128, $IMM8: u32) -> i32 { return transmute(i32)simd.extract(a, IMM8) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_extract_epi8 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> i32 { return 
i32(simd.extract(transmute(u8x16)a, IMM8)) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_extract_epi32 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> i32 { return simd.extract(transmute(i32x4)a, IMM8) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_insert_ps :: #force_inline proc "c" (a, b: __m128, $IMM8: u8) -> __m128 { return insertps(a, b, IMM8) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_insert_epi8 :: #force_inline proc "c" (a: __m128i, i: i32, $IMM8: u32) -> __m128i { return transmute(__m128i)simd.replace(transmute(i8x16)a, IMM8, i8(i)) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_insert_epi32 :: #force_inline proc "c" (a: __m128i, i: i32, $IMM8: u32) -> __m128i { return transmute(__m128i)simd.replace(transmute(i32x4)a, IMM8, i) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_max_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)pmaxsb(transmute(i8x16)a, transmute(i8x16)b) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_max_epu16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)pmaxuw(transmute(u16x8)a, transmute(u16x8)b) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_max_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)pmaxsd(transmute(i32x4)a, transmute(i32x4)b) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_max_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)pmaxud(transmute(u32x4)a, transmute(u32x4)b) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_min_epi8 :: #force_inline proc "c" (a, b: __m128i) -> 
__m128i { return transmute(__m128i)pminsb(transmute(i8x16)a, transmute(i8x16)b) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_min_epu16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)pminuw(transmute(u16x8)a, transmute(u16x8)b) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_min_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)pminsd(transmute(i32x4)a, transmute(i32x4)b) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_min_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)pminud(transmute(u32x4)a, transmute(u32x4)b) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_packus_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)packusdw(transmute(i32x4)a, transmute(i32x4)b) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_cmpeq_epi64 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.lanes_eq(transmute(i64x2)a, transmute(i64x2)b) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_cvtepi8_epi16 :: #force_inline proc "c" (a: __m128i) -> __m128i { x := transmute(i8x16)a y := simd.shuffle(x, x, 0, 1, 2, 3, 4, 5, 6, 7) return transmute(__m128i)i16x8(y) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_cvtepi8_epi32 :: #force_inline proc "c" (a: __m128i) -> __m128i { x := transmute(i8x16)a y := simd.shuffle(x, x, 0, 1, 2, 3) return transmute(__m128i)i32x4(y) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_cvtepi8_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i { x := transmute(i8x16)a y := simd.shuffle(x, x, 0, 1) return transmute(__m128i)i64x2(y) } 
-@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_cvtepi16_epi32 :: #force_inline proc "c" (a: __m128i) -> __m128i { x := transmute(i16x8)a y := simd.shuffle(x, x, 0, 1, 2, 3) return transmute(__m128i)i32x4(y) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_cvtepi16_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i { x := transmute(i16x8)a y := simd.shuffle(x, x, 0, 1) return transmute(__m128i)i64x2(y) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_cvtepi32_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i { x := transmute(i32x4)a y := simd.shuffle(x, x, 0, 1) return transmute(__m128i)i64x2(y) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_cvtepu8_epi16 :: #force_inline proc "c" (a: __m128i) -> __m128i { x := transmute(u8x16)a y := simd.shuffle(x, x, 0, 1, 2, 3, 4, 5, 6, 7) return transmute(__m128i)i16x8(y) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_cvtepu8_epi32 :: #force_inline proc "c" (a: __m128i) -> __m128i { x := transmute(u8x16)a y := simd.shuffle(x, x, 0, 1, 2, 3) return transmute(__m128i)i32x4(y) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_cvtepu8_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i { x := transmute(u8x16)a y := simd.shuffle(x, x, 0, 1) return transmute(__m128i)i64x2(y) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_cvtepu16_epi32 :: #force_inline proc "c" (a: __m128i) -> __m128i { x := transmute(u16x8)a y := simd.shuffle(x, x, 0, 1, 2, 3) return transmute(__m128i)i32x4(y) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_cvtepu16_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i { x := transmute(u16x8)a y := simd.shuffle(x, x, 0, 1) return 
transmute(__m128i)i64x2(y) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_cvtepu32_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i { x := transmute(u32x4)a y := simd.shuffle(x, x, 0, 1) return transmute(__m128i)i64x2(y) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_dp_pd :: #force_inline proc "c" (a, b: __m128d, $IMM8: u8) -> __m128d { return dppd(a, b, IMM8) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_dp_ps :: #force_inline proc "c" (a, b: __m128, $IMM8: u8) -> __m128 { return dpps(a, b, IMM8) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_floor_pd :: #force_inline proc "c" (a: __m128d) -> __m128d { return simd.floor(a) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_floor_ps :: #force_inline proc "c" (a: __m128) -> __m128 { return simd.floor(a) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_floor_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return roundsd(a, b, _MM_FROUND_FLOOR) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_floor_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { return roundss(a, b, _MM_FROUND_FLOOR) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_ceil_pd :: #force_inline proc "c" (a: __m128d) -> __m128d { return simd.ceil(a) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_ceil_ps :: #force_inline proc "c" (a: __m128) -> __m128 { return simd.ceil(a) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_ceil_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { return roundsd(a, b, _MM_FROUND_CEIL) } -@(enable_target_feature="sse4.1") +@(require_results, 
enable_target_feature="sse4.1") _mm_ceil_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { return roundss(a, b, _MM_FROUND_CEIL) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_round_pd :: #force_inline proc "c" (a: __m128d, $ROUNDING: i32) -> __m128d { return roundpd(a, ROUNDING) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_round_ps :: #force_inline proc "c" (a: __m128, $ROUNDING: i32) -> __m128 { return roundps(a, ROUNDING) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_round_sd :: #force_inline proc "c" (a, b: __m128d, $ROUNDING: i32) -> __m128d { return roundsd(a, b, ROUNDING) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_round_ss :: #force_inline proc "c" (a, b: __m128, $ROUNDING: i32) -> __m128 { return roundss(a, b, ROUNDING) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_minpos_epu16 :: #force_inline proc "c" (a: __m128i) -> __m128i { return transmute(__m128i)phminposuw(transmute(u16x8)a) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_mul_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)pmuldq(transmute(i32x4)a, transmute(i32x4)b) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_mullo_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)simd.mul(transmute(i32x4)a, transmute(i32x4)b) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_mpsadbw_epu8 :: #force_inline proc "c" (a, b: __m128i, $IMM8: u8) -> __m128i { return transmute(__m128i)mpsadbw(transmute(u8x16)a, transmute(u8x16)b, IMM8) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_testz_si128 :: #force_inline proc "c" (a: 
__m128i, mask: __m128i) -> i32 { return ptestz(transmute(i64x2)a, transmute(i64x2)mask) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_testc_si128 :: #force_inline proc "c" (a: __m128i, mask: __m128i) -> i32 { return ptestc(transmute(i64x2)a, transmute(i64x2)mask) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_testnzc_si128 :: #force_inline proc "c" (a: __m128i, mask: __m128i) -> i32 { return ptestnzc(transmute(i64x2)a, transmute(i64x2)mask) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_test_all_zeros :: #force_inline proc "c" (a: __m128i, mask: __m128i) -> i32 { return _mm_testz_si128(a, mask) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_test_all_ones :: #force_inline proc "c" (a: __m128i) -> i32 { return _mm_testc_si128(a, _mm_cmpeq_epi32(a, a)) } -@(enable_target_feature="sse4.1") +@(require_results, enable_target_feature="sse4.1") _mm_test_mix_ones_zeros :: #force_inline proc "c" (a: __m128i, mask: __m128i) -> i32 { return _mm_testnzc_si128(a, mask) } when ODIN_ARCH == .amd64 { - @(enable_target_feature="sse4.1") + @(require_results, enable_target_feature="sse4.1") _mm_extract_epi64 :: #force_inline proc "c" (a: __m128i, $IMM1: u32) -> i64 { return simd.extract(transmute(i64x2)a, IMM1) } - @(enable_target_feature="sse4.1") + @(require_results, enable_target_feature="sse4.1") _mm_insert_epi64 :: #force_inline proc "c" (a: __m128i, i: i64, $IMM1: u32) -> __m128i { return transmute(__m128i)simd.replace(transmute(i64x2)a, IMM1, i) } diff --git a/core/simd/x86/ssse3.odin b/core/simd/x86/ssse3.odin index 8c677aed4..f11ef6774 100644 --- a/core/simd/x86/ssse3.odin +++ b/core/simd/x86/ssse3.odin @@ -5,23 +5,23 @@ import "core:intrinsics" import "core:simd" _ :: simd -@(enable_target_feature="ssse3") +@(require_results, enable_target_feature="ssse3") _mm_abs_epi8 :: #force_inline proc 
"c" (a: __m128i) -> __m128i { return transmute(__m128i)pabsb128(transmute(i8x16)a) } -@(enable_target_feature="ssse3") +@(require_results, enable_target_feature="ssse3") _mm_abs_epi16 :: #force_inline proc "c" (a: __m128i) -> __m128i { return transmute(__m128i)pabsw128(transmute(i16x8)a) } -@(enable_target_feature="ssse3") +@(require_results, enable_target_feature="ssse3") _mm_abs_epi32 :: #force_inline proc "c" (a: __m128i) -> __m128i { return transmute(__m128i)pabsd128(transmute(i32x4)a) } -@(enable_target_feature="ssse3") +@(require_results, enable_target_feature="ssse3") _mm_shuffle_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)pshufb128(transmute(u8x16)a, transmute(u8x16)b) } -@(enable_target_feature="ssse3") +@(require_results, enable_target_feature="ssse3") _mm_alignr_epi8 :: #force_inline proc "c" (a, b: __m128i, $IMM8: u32) -> __m128i { shift :: IMM8 @@ -58,47 +58,47 @@ _mm_alignr_epi8 :: #force_inline proc "c" (a, b: __m128i, $IMM8: u32) -> __m128i } -@(enable_target_feature="ssse3") +@(require_results, enable_target_feature="ssse3") _mm_hadd_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)phaddw128(transmute(i16x8)a, transmute(i16x8)b) } -@(enable_target_feature="ssse3") +@(require_results, enable_target_feature="ssse3") _mm_hadds_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)phaddsw128(transmute(i16x8)a, transmute(i16x8)b) } -@(enable_target_feature="ssse3") +@(require_results, enable_target_feature="ssse3") _mm_hadd_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)phaddd128(transmute(i32x4)a, transmute(i32x4)b) } -@(enable_target_feature="ssse3") +@(require_results, enable_target_feature="ssse3") _mm_hsub_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)phsubw128(transmute(i16x8)a, transmute(i16x8)b) } -@(enable_target_feature="ssse3") +@(require_results, 
enable_target_feature="ssse3") _mm_hsubs_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)phsubsw128(transmute(i16x8)a, transmute(i16x8)b) } -@(enable_target_feature="ssse3") +@(require_results, enable_target_feature="ssse3") _mm_hsub_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)phsubd128(transmute(i32x4)a, transmute(i32x4)b) } -@(enable_target_feature="ssse3") +@(require_results, enable_target_feature="ssse3") _mm_maddubs_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)pmaddubsw128(transmute(u8x16)a, transmute(i8x16)b) } -@(enable_target_feature="ssse3") +@(require_results, enable_target_feature="ssse3") _mm_mulhrs_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)pmulhrsw128(transmute(i16x8)a, transmute(i16x8)b) } -@(enable_target_feature="ssse3") +@(require_results, enable_target_feature="ssse3") _mm_sign_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)psignb128(transmute(i8x16)a, transmute(i8x16)b) } -@(enable_target_feature="ssse3") +@(require_results, enable_target_feature="ssse3") _mm_sign_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)psignw128(transmute(i16x8)a, transmute(i16x8)b) } -@(enable_target_feature="ssse3") +@(require_results, enable_target_feature="ssse3") _mm_sign_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)psignd128(transmute(i32x4)a, transmute(i32x4)b) }