From 20fe6d102a8469eac180bc0051c5359acad07ac1 Mon Sep 17 00:00:00 2001
From: gingerBill
Date: Fri, 27 May 2022 16:58:35 +0100
Subject: [PATCH] Add load and stores and sets

---
 core/simd/x86/sse2.odin | 75 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/core/simd/x86/sse2.odin b/core/simd/x86/sse2.odin
index 970704bb7..bba842820 100644
--- a/core/simd/x86/sse2.odin
+++ b/core/simd/x86/sse2.odin
@@ -1,6 +1,7 @@
 //+build i386, amd64
 package simd_x86
 
+import "core:intrinsics" // used below for unaligned_load, mem_copy_non_overlapping and nontemporal_store
 import "core:simd"
 
 _mm_pause :: #force_inline proc "c" () {
@@ -287,6 +288,80 @@ _mm_cvtsi128_si32 :: #force_inline proc "c" (a: __m128i) -> i32 {
 
 
 
+_mm_set_epi64x :: #force_inline proc "c" (e1, e0: i64) -> __m128i { // Builds a vector from two i64 values; arguments are given highest element first.
+	return transmute(__m128i)i64x2{e0, e1} // literal order is element 0 first, so e0 comes first
+}
+_mm_set_epi32 :: #force_inline proc "c" (e3, e2, e1, e0: i32) -> __m128i { // Builds a vector from four i32 values; the last argument (e0) becomes element 0.
+	return transmute(__m128i)i32x4{e0, e1, e2, e3}
+}
+_mm_set_epi16 :: #force_inline proc "c" (e7, e6, e5, e4, e3, e2, e1, e0: i16) -> __m128i { // Builds a vector from eight i16 values; the last argument (e0) becomes element 0.
+	return transmute(__m128i)i16x8{e0, e1, e2, e3, e4, e5, e6, e7}
+}
+_mm_set_epi8 :: #force_inline proc "c" (e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0: i8) -> __m128i { // Builds a vector from sixteen i8 values; the last argument (e0) becomes element 0.
+	return transmute(__m128i)i8x16{e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15}
+}
+_mm_set1_epi64x :: #force_inline proc "c" (a: i64) -> __m128i { // Broadcast: both i64 elements are set to `a`.
+	return _mm_set_epi64x(a, a)
+}
+_mm_set1_epi32 :: #force_inline proc "c" (a: i32) -> __m128i { // Broadcast: all four i32 elements are set to `a`.
+	return _mm_set_epi32(a, a, a, a)
+}
+_mm_set1_epi16 :: #force_inline proc "c" (a: i16) -> __m128i { // Broadcast: all eight i16 elements are set to `a`.
+	return _mm_set_epi16(a, a, a, a, a, a, a, a)
+}
+_mm_set1_epi8 :: #force_inline proc "c" (a: i8) -> __m128i { // Broadcast: all sixteen i8 elements are set to `a`.
+	return _mm_set_epi8(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a)
+}
+_mm_setr_epi32 :: #force_inline proc "c" (e3, e2, e1, e0: i32) -> __m128i { // "Reversed" set: the FIRST argument ends up in element 0.
+	return _mm_set_epi32(e0, e1, e2, e3) // forwards the arguments to _mm_set_epi32 in the opposite order
+}
+_mm_setr_epi16 :: #force_inline proc "c" (e7, e6, e5, e4, e3, e2, e1, e0: i16) -> __m128i { // "Reversed" set: the FIRST argument ends up in element 0.
+	return _mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7) // forwards the arguments to _mm_set_epi16 in the opposite order
+}
+_mm_setr_epi8 :: #force_inline proc "c" (e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0: i8) -> __m128i { // "Reversed" set: the FIRST argument ends up in element 0.
+	return _mm_set_epi8(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15) // forwards the arguments to _mm_set_epi8 in the opposite order
+}
+_mm_setzero_si128 :: #force_inline proc "c" () -> __m128i { // Returns a vector with every bit zero (broadcast of 0).
+	return _mm_set1_epi64x(0)
+}
+
+// Loads, stores and moves.
+_mm_loadl_epi64 :: #force_inline proc "c" (mem_addr: ^__m128i) -> __m128i { // Reads one i64 from mem_addr into element 0; element 1 is zero.
+	return _mm_set_epi64x(0, intrinsics.unaligned_load((^i64)(mem_addr))) // pointer is reinterpreted as ^i64, so only 8 bytes are read
+}
+_mm_load_si128 :: #force_inline proc "c" (mem_addr: ^__m128i) -> __m128i { // Loads a full vector by plain dereference.
+	return mem_addr^ // NOTE(review): presumably requires mem_addr to be aligned for __m128i - confirm
+}
+_mm_loadu_si128 :: #force_inline proc "c" (mem_addr: ^__m128i) -> __m128i { // Loads a full vector with a byte copy instead of a dereference.
+	dst := _mm_undefined_si128() // defined outside this hunk; every byte of dst is overwritten by the copy below
+	intrinsics.mem_copy_non_overlapping(&dst, mem_addr, size_of(__m128i))
+	return dst
+}
+_mm_maskmoveu_si128 :: #force_inline proc "c" (a, mask: __m128i, mem_addr: rawptr) { // Forwards to maskmovdqu, which is not defined in this hunk.
+	maskmovdqu(transmute(i8x16)a, transmute(i8x16)mask, mem_addr) // a and mask are passed as i8x16; NOTE(review): presumably a byte-masked store - confirm against maskmovdqu's declaration
+}
+_mm_store_si128 :: #force_inline proc "c" (mem_addr: ^__m128i, a: __m128i) { // Stores a full vector by plain assignment through the pointer.
+	mem_addr^ = a // NOTE(review): presumably requires mem_addr to be aligned for __m128i - confirm
+}
+_mm_storeu_si128 :: #force_inline proc "c" (mem_addr: ^__m128i, a: __m128i) { // Forwards to storeudq, which is not defined in this hunk.
+	storeudq(mem_addr, a) // NOTE(review): presumably the unaligned counterpart of _mm_store_si128 - confirm against storeudq's declaration
+}
+_mm_storel_epi64 :: #force_inline proc "c" (mem_addr: ^__m128i, a: __m128i) { // Writes only the first 8 bytes of `a` to mem_addr.
+	a := a // local copy of the parameter so that its address can be taken
+	intrinsics.mem_copy_non_overlapping(mem_addr, &a, 8) // 8 bytes = one 64-bit element
+}
+_mm_stream_si128 :: #force_inline proc "c" (mem_addr: ^__m128i, a: __m128i) { // Stores a full vector via intrinsics.nontemporal_store.
+	intrinsics.nontemporal_store(mem_addr, a)
+}
+_mm_stream_si32 :: #force_inline proc "c" (mem_addr: ^i32, a: i32) { // Stores a single i32 via intrinsics.nontemporal_store.
+	intrinsics.nontemporal_store(mem_addr, a)
+}
+_mm_move_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i { // Combines `a` with an all-zero vector through a two-index shuffle.
+	zero := _mm_setzero_si128()
+	return transmute(__m128i)simd.shuffle(transmute(i64x2)a, transmute(i64x2)zero, 0, 2) // NOTE(review): presumably index 0 picks a[0] and index 2 picks zero[0], giving {a[0], 0} - confirm against simd.shuffle
+}
+
+
 
 
 