Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13,926 changes: 13,926 additions & 0 deletions crates/core_arch/src/x86/avx10_2.rs

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions crates/core_arch/src/x86/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -778,3 +778,7 @@ pub use self::kl::*;
mod movrs;
#[unstable(feature = "movrs_target_feature", issue = "137976")]
pub use self::movrs::*;

mod avx10_2;
#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
pub use self::avx10_2::*;
74 changes: 72 additions & 2 deletions crates/core_arch/src/x86/sha.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,14 @@ unsafe extern "C" {
fn vsm4key4128(a: i32x4, b: i32x4) -> i32x4;
#[link_name = "llvm.x86.vsm4key4256"]
fn vsm4key4256(a: i32x8, b: i32x8) -> i32x8;
#[link_name = "llvm.x86.vsm4key4512"]
fn vsm4key4512(a: i32x16, b: i32x16) -> i32x16;
#[link_name = "llvm.x86.vsm4rnds4128"]
fn vsm4rnds4128(a: i32x4, b: i32x4) -> i32x4;
#[link_name = "llvm.x86.vsm4rnds4256"]
fn vsm4rnds4256(a: i32x8, b: i32x8) -> i32x8;
#[link_name = "llvm.x86.vsm4rnds4512"]
fn vsm4rnds4512(a: i32x16, b: i32x16) -> i32x16;
}

#[cfg(test)]
Expand Down Expand Up @@ -252,6 +256,16 @@ pub fn _mm256_sm4key4_epi32(a: __m256i, b: __m256i) -> __m256i {
unsafe { transmute(vsm4key4256(a.as_i32x8(), b.as_i32x8())) }
}

/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic operates on independent
/// 128-bit lanes. The calculated results are stored in dst.
#[inline]
#[target_feature(enable = "sm4,avx10.2")]
#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vsm4key4))]
#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
pub fn _mm512_sm4key4_epi32(a: __m512i, b: __m512i) -> __m512i {
unsafe { vsm4key4512(a.as_i32x16(), b.as_i32x16()).as_m512i() }
}

/// This intrinsic performs four rounds of SM4 encryption. The intrinsic operates on independent
/// 128-bit lanes. The calculated results are stored in dst.
///
Expand All @@ -276,6 +290,16 @@ pub fn _mm256_sm4rnds4_epi32(a: __m256i, b: __m256i) -> __m256i {
unsafe { transmute(vsm4rnds4256(a.as_i32x8(), b.as_i32x8())) }
}

/// This intrinsic performs four rounds of SM4 encryption. The intrinsic operates on independent
/// 128-bit lanes. The calculated results are stored in dst.
#[inline]
#[target_feature(enable = "sm4,avx10.2")]
#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vsm4rnds4))]
#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
pub fn _mm512_sm4rnds4_epi32(a: __m512i, b: __m512i) -> __m512i {
unsafe { vsm4rnds4512(a.as_i32x16(), b.as_i32x16()).as_m512i() }
}

#[cfg(test)]
mod tests {
use crate::{
Expand Down Expand Up @@ -475,10 +499,12 @@ mod tests {
assert_eq_m256i(r, e);
}

static DATA_32: [u32; 16] = [
static DATA_32: [u32; 32] = [
0x00112233, 0x44556677, 0x8899aabb, 0xccddeeff, 0xffeeddcc, 0xbbaa9988, 0x77665544,
0x33221100, 0x01234567, 0x89abcdef, 0xfedcba98, 0x76543210, 0x02468ace, 0x13579bdf,
0xfdb97531, 0xeca86420,
0xfdb97531, 0xeca86420, 0x048c159d, 0x26ae37bf, 0xfb73ea62, 0xd951c840, 0xabcdef01,
0x23456789, 0x0fedcba9, 0x87654321, 0x10fedcba, 0x98765432, 0x1fdb9753, 0xeca86420,
0x048c159d, 0x26ae37bf, 0xfb73ea62, 0xd951c840,
];

#[simd_test(enable = "sm3,avx")]
Expand Down Expand Up @@ -685,6 +711,31 @@ mod tests {
assert_eq_m256i(r, e);
}

#[inline]
#[target_feature(enable = "avx512f")]
fn _mm512_set_m256i(lo: __m256i, hi: __m256i) -> __m512i {
unsafe { simd_shuffle!(lo, hi, [0, 1, 2, 3, 4, 5, 6, 7]) }
}

#[simd_test(enable = "sm4,avx10.2")]
fn test_mm512_sm4key4_epi32() {
let a_low = unsafe { _mm256_loadu_si256(DATA_32.as_ptr().cast()) };
let a_high = unsafe { _mm256_loadu_si256(DATA_32[8..].as_ptr().cast()) };
let b_low = unsafe { _mm256_loadu_si256(DATA_32[16..].as_ptr().cast()) };
let b_high = unsafe { _mm256_loadu_si256(DATA_32[24..].as_ptr().cast()) };

let a = _mm512_set_m256i(a_high, a_low);
let b = _mm512_set_m256i(b_high, b_low);

let r = _mm512_sm4key4_epi32(a, b);

let e_low = _mm256_sm4key4_epi32(a_low, b_low);
let e_high = _mm256_sm4key4_epi32(a_high, b_high);
let e = _mm512_set_m256i(e_high, e_low);

assert_eq_m512i(r, e);
}

#[simd_test(enable = "sm4,avx")]
fn test_mm_sm4rnds4_epi32() {
fn l_rnd(x: u32) -> u32 {
Expand Down Expand Up @@ -729,4 +780,23 @@ mod tests {

assert_eq_m256i(r, e);
}

#[simd_test(enable = "sm4,avx10.2")]
fn test_mm512_sm4rnds4_epi32() {
let a_low = unsafe { _mm256_loadu_si256(DATA_32.as_ptr().cast()) };
let a_high = unsafe { _mm256_loadu_si256(DATA_32[8..].as_ptr().cast()) };
let b_low = unsafe { _mm256_loadu_si256(DATA_32[16..].as_ptr().cast()) };
let b_high = unsafe { _mm256_loadu_si256(DATA_32[24..].as_ptr().cast()) };

let a = _mm512_set_m256i(a_high, a_low);
let b = _mm512_set_m256i(b_high, b_low);

let r = _mm512_sm4rnds4_epi32(a, b);

let e_low = _mm256_sm4rnds4_epi32(a_low, b_low);
let e_high = _mm256_sm4rnds4_epi32(a_high, b_high);
let e = _mm512_set_m256i(e_high, e_low);

assert_eq_m512i(r, e);
}
}
Loading