// SPDX-License-Identifier: GPL-2.0-only
/*
 * arch/arm64/lib/xor-neon.c
 *
 * Authors: Jackie Liu <liuyun01@kylinos.cn>
 * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
 */

#include <linux/raid/xor.h>
#include <linux/module.h>
#include <asm/neon-intrinsics.h>

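/*
 * Each xor_arm64_neon_<n>() XORs <n> - 1 source buffers into p1, processing
 * 64 bytes (four 128-bit NEON vectors) per loop iteration. Callers are
 * expected to pass a byte count that is a non-zero multiple of 64 and to
 * have enabled kernel-mode NEON around the call.
 */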
void xor_arm64_neon_2(unsigned long bytes, unsigned long * __restrict p1,
	const unsigned long * __restrict p2)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
		v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
		v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
		v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));

		/* store */
		vst1q_u64(dp1 +  0, v0);
		vst1q_u64(dp1 +  2, v1);
		vst1q_u64(dp1 +  4, v2);
		vst1q_u64(dp1 +  6, v3);

		dp1 += 8;
		dp2 += 8;
	} while (--lines > 0);
}

void xor_arm64_neon_3(unsigned long bytes, unsigned long * __restrict p1,
	const unsigned long * __restrict p2,
	const unsigned long * __restrict p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
		v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
		v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
		v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));

		/* store */
		vst1q_u64(dp1 +  0, v0);
		vst1q_u64(dp1 +  2, v1);
		vst1q_u64(dp1 +  4, v2);
		vst1q_u64(dp1 +  6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	} while (--lines > 0);
}

void xor_arm64_neon_4(unsigned long bytes, unsigned long * __restrict p1,
	const unsigned long * __restrict p2,
	const unsigned long * __restrict p3,
	const unsigned long * __restrict p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
		v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
		v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
		v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 +  0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 +  2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 +  4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 +  6));

		/* store */
		vst1q_u64(dp1 +  0, v0);
		vst1q_u64(dp1 +  2, v1);
		vst1q_u64(dp1 +  4, v2);
		vst1q_u64(dp1 +  6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	} while (--lines > 0);
}

void xor_arm64_neon_5(unsigned long bytes, unsigned long * __restrict p1,
	const unsigned long * __restrict p2,
	const unsigned long * __restrict p3,
	const unsigned long * __restrict p4,
	const unsigned long * __restrict p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;
	uint64_t *dp5 = (uint64_t *)p5;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
		v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
		v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
		v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 +  0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 +  2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 +  4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 +  6));

		/* p1 ^= p5 */
		v0 = veorq_u64(v0, vld1q_u64(dp5 +  0));
		v1 = veorq_u64(v1, vld1q_u64(dp5 +  2));
		v2 = veorq_u64(v2, vld1q_u64(dp5 +  4));
		v3 = veorq_u64(v3, vld1q_u64(dp5 +  6));

		/* store */
		vst1q_u64(dp1 +  0, v0);
		vst1q_u64(dp1 +  2, v1);
		vst1q_u64(dp1 +  4, v2);
		vst1q_u64(dp1 +  6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	} while (--lines > 0);
}

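/*
 * Default block template using plain NEON XOR. xor_neon_init() may replace
 * the 3-, 4- and 5-source handlers with the EOR3-based versions below when
 * the CPU supports the SHA-3 extension.
 */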
struct xor_block_template xor_block_inner_neon __ro_after_init = {
	.name	= "__inner_neon__",
	.do_2	= xor_arm64_neon_2,
	.do_3	= xor_arm64_neon_3,
	.do_4	= xor_arm64_neon_4,
	.do_5	= xor_arm64_neon_5,
};
EXPORT_SYMBOL(xor_block_inner_neon);

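/*
 * EOR3 (three-way XOR) is part of the ARMv8.2 SHA-3 extension. The inline
 * assembly below uses a local ".arch_extension sha3" directive so the
 * assembler accepts the instruction without the whole file being built for
 * SHA-3 capable CPUs; the EOR3 routines are only installed at runtime once
 * the CPU feature has been confirmed.
 */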
static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
{
	uint64x2_t res;

	asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
	    "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
	    : "=w"(res) : "w"(p), "w"(q), "w"(r));
	return res;
}

static void xor_arm64_eor3_3(unsigned long bytes,
	unsigned long * __restrict p1,
	const unsigned long * __restrict p2,
	const unsigned long * __restrict p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	} while (--lines > 0);
}

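/*
 * 4 sources: fold p2 and p3 into p1 with a single EOR3 per vector, then XOR
 * in the remaining p4 with a plain EOR. The 5-source variant pairs p4 and p5
 * into a second EOR3 pass instead.
 */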
static void xor_arm64_eor3_4(unsigned long bytes,
	unsigned long * __restrict p1,
	const unsigned long * __restrict p2,
	const unsigned long * __restrict p3,
	const unsigned long * __restrict p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	} while (--lines > 0);
}

static void xor_arm64_eor3_5(unsigned long bytes,
	unsigned long * __restrict p1,
	const unsigned long * __restrict p2,
	const unsigned long * __restrict p3,
	const unsigned long * __restrict p4,
	const unsigned long * __restrict p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;
	uint64_t *dp5 = (uint64_t *)p5;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* p1 ^= p4 ^ p5 */
		v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
		v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
		v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
		v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	} while (--lines > 0);
}

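/*
 * Switch to the EOR3 handlers when the assembler can emit SHA-3 instructions
 * (CONFIG_AS_HAS_SHA3) and the running CPU advertises the SHA3 feature;
 * otherwise the plain NEON handlers registered above stay in place.
 */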
static int __init xor_neon_init(void)
{
	if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {
		xor_block_inner_neon.do_3 = xor_arm64_eor3_3;
		xor_block_inner_neon.do_4 = xor_arm64_eor3_4;
		xor_block_inner_neon.do_5 = xor_arm64_eor3_5;
	}
	return 0;
}
module_init(xor_neon_init);

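/*
 * Nothing to tear down; an exit handler is provided so the code can be
 * unloaded when it is built as a module.
 */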
static void __exit xor_neon_exit(void)
{
}
module_exit(xor_neon_exit);

MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
MODULE_DESCRIPTION("ARMv8 XOR Extensions");
MODULE_LICENSE("GPL");