| /* |
| * CDDL HEADER START |
| * |
| * The contents of this file are subject to the terms of the |
| * Common Development and Distribution License (the "License"). |
| * You may not use this file except in compliance with the License. |
| * |
| * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
| * or http://www.opensolaris.org/os/licensing. |
| * See the License for the specific language governing permissions |
| * and limitations under the License. |
| * |
| * When distributing Covered Code, include this CDDL HEADER in each |
| * file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
| * If applicable, add the following below this CDDL HEADER, with the |
| * fields enclosed by brackets "[]" replaced with your own identifying |
| * information: Portions Copyright [yyyy] [name of copyright owner] |
| * |
| * CDDL HEADER END |
| */ |
| |
| /* |
| * Copyright (c) 2009 Intel Corporation |
| * All Rights Reserved. |
| */ |
| /* |
| * Copyright 2009 Sun Microsystems, Inc. All rights reserved. |
| * Use is subject to license terms. |
| */ |
| |
| /* |
| * Accelerated GHASH implementation with Intel PCLMULQDQ-NI |
| * instructions. This file contains an accelerated |
| * Galois Field Multiplication implementation. |
| * |
| * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH, |
| * carry-less multiplication. More information about PCLMULQDQ can be |
| * found at: |
| * http://software.intel.com/en-us/articles/ |
| * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/ |
| * |
| */ |
| |
| /* |
| * ==================================================================== |
| * OpenSolaris OS modifications |
| * |
| * This source originates as file galois_hash_asm.c from |
| * Intel Corporation dated September 21, 2009. |
| * |
| * This OpenSolaris version has these major changes from the original source: |
| * |
| * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from |
| * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function |
| * definition for lint. |
| * |
| * 2. Formatted code, added comments, and added #includes and #defines. |
| * |
| * 3. If bit CR0.TS is set, clear and set the TS bit, after and before |
| * calling kpreempt_disable() and kpreempt_enable(). |
| * If the TS bit is not set, save and restore %xmm registers at the beginning |
| * and end of function calls (%xmm* registers are not saved and restored |
| * during kernel thread preemption). |
| * |
| * 4. Removed code to perform hashing. This is already done with C macro |
| * GHASH in gcm.c. For better performance, this removed code should be |
| * reintegrated in the future to replace the C GHASH macro. |
| * |
| * 5. Added code to byte swap 16-byte input and output. |
| * |
| * 6. Folded in comments from the original C source with embedded assembly |
| * (SB_w_shift_xor.c) |
| * |
| * 7. Renamed function and reordered parameters to match OpenSolaris: |
| * Intel interface: |
| * void galois_hash_asm(unsigned char *hk, unsigned char *s, |
| * unsigned char *d, int length) |
| * OpenSolaris OS interface: |
| * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res); |
| * ==================================================================== |
| */ |
| |
| |
| #if defined(lint) || defined(__lint) /* lint */ |
| |
| #include <sys/types.h> |
| |
| /* |
| * Empty stand-in that gives lint(1B) a C definition of the routine. |
| * The three arguments are intentionally unused. |
| */ |
| /* ARGSUSED */ |
| void |
| gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) |
| { |
| } |
| |
| #elif defined(HAVE_PCLMULQDQ) /* guard by instruction set */ |
| |
| #define _ASM |
| #include <sys/asm_linkage.h> |
| |
| /* |
| * Shuffle-control mask for the pshufb instruction that reverses the byte |
| * order of a 16-byte value: result byte i is taken from source byte 15 - i. |
| * |
| * C equivalent: |
| * static const uint8_t byte_swap16_mask[16] = { |
| * 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; |
| * |
| * The table is only ever read in this file, so it is placed in the |
| * read-only data section instead of the writable .data section. |
| */ |
| .section .rodata |
| .align XMM_ALIGN |
| .Lbyte_swap16_mask: |
| .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 |
| |
| |
| /* |
| * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res); |
| * |
| * Perform a carry-less multiplication (that is, use XOR instead of the |
| * multiply operator) on P1 and P2 and place the result in P3. |
| * |
| * Byte swap the input and the output. |
| * |
| * Note: x_in, y, and res each point to a 16-byte (128-bit) number |
| * (an array of two 64-bit integers). |
| * |
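| * Note2: For kernel code, caller is responsible for ensuring |
| * kpreempt_disable() has been called. This is because %xmm registers are |
| * not saved/restored. The original description says to clear and set the |
| * CR0.TS bit on entry and exit, respectively, if TS is set on entry, and |
| * otherwise, if TS is not set, to save and restore %xmm registers on the |
| * stack. |
| * NOTE(review): the routine below contains neither step; it overwrites |
| * %rax and %xmm0-%xmm10 without saving them, so that protection presumably |
| * has to be provided by the caller -- confirm against the call sites. |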
| * |
| * Note3: Original Intel definition: |
| * void galois_hash_asm(unsigned char *hk, unsigned char *s, |
| * unsigned char *d, int length) |
| * |
| * Note4: Register/parameter mapping: |
| * Intel: |
| * Parameter 1: %rcx (copied to %xmm0) hk or x_in |
| * Parameter 2: %rdx (copied to %xmm1) s or y |
| * Parameter 3: %rdi (result) d or res |
| * OpenSolaris: |
| * Parameter 1: %rdi (copied to %xmm0) x_in |
| * Parameter 2: %rsi (copied to %xmm1) y |
| * Parameter 3: %rdx (result) res |
| */ |
| |
| ENTRY_NP(gcm_mul_pclmulqdq) |
|     // |
|     // In:      %rdi = x_in, %rsi = y, %rdx = res |
|     // Out:     16 bytes stored at (%rdx) |
|     // Scratch: %rax and %xmm0-%xmm10 are overwritten; nothing is |
|     //          saved or restored inside this routine. |
|     // |
| |
|     // Fetch both operands. The loads are unaligned-safe. |
|     movdqu (%rdi), %xmm0 // xmm0 = P1 (x_in) |
|     movdqu (%rsi), %xmm1 // xmm1 = P2 (y) |
| |
|     // Reverse the byte order of both 16-byte operands. The mask is |
|     // left in %xmm10 so the result can be swapped back before the store. |
|     lea .Lbyte_swap16_mask(%rip), %rax |
|     movdqu (%rax), %xmm10 |
|     pshufb %xmm10, %xmm0 |
|     pshufb %xmm10, %xmm1 |
| |
|     // Four 64x64-bit carry-less partial products of a = xmm0 and |
|     // b = xmm1 (a0/b0 = low quadword, a1/b1 = high quadword). |
|     movdqa %xmm0, %xmm3 |
|     movdqa %xmm0, %xmm4 |
|     movdqa %xmm0, %xmm5 |
|     movdqa %xmm0, %xmm6 |
|     pclmulqdq $0x00, %xmm1, %xmm3 // xmm3 = a0*b0 |
|     pclmulqdq $0x10, %xmm1, %xmm4 // xmm4 = a0*b1 |
|     pclmulqdq $0x01, %xmm1, %xmm5 // xmm5 = a1*b0 |
|     pclmulqdq $0x11, %xmm1, %xmm6 // xmm6 = a1*b1 |
| |
|     // Combine the two middle products and fold them into the |
|     // 256-bit result held in the register pair <xmm6:xmm3>. |
|     pxor %xmm5, %xmm4 // xmm4 = a0*b1 + a1*b0 |
|     movdqa %xmm4, %xmm5 |
|     pslldq $8, %xmm5 // middle term moved up by 64 bits |
|     psrldq $8, %xmm4 // middle term moved down by 64 bits |
|     pxor %xmm5, %xmm3 |
|     pxor %xmm4, %xmm6 // <xmm6:xmm3> = xmm0 * xmm1 (carry-less) |
| |
|     // Shift <xmm6:xmm3> left by one bit position, to cope with the |
|     // fact that the bits are reversed. pslld shifts each 32-bit lane |
|     // on its own, so the bit leaving the top of every lane is captured |
|     // separately and OR-ed into the next lane up. |
|     movdqa %xmm3, %xmm7 |
|     movdqa %xmm6, %xmm8 |
|     psrld $31, %xmm7 // top bit of each lane of xmm3 |
|     psrld $31, %xmm8 // top bit of each lane of xmm6 |
|     pslld $1, %xmm3 |
|     pslld $1, %xmm6 |
|     movdqa %xmm7, %xmm9 |
|     psrldq $12, %xmm9 // bit leaving xmm3, headed for xmm6 lane 0 |
|     pslldq $4, %xmm7 // carries move up one lane |
|     pslldq $4, %xmm8 |
|     por %xmm7, %xmm3 |
|     por %xmm8, %xmm6 |
|     por %xmm9, %xmm6 |
| |
|     // |
|     // First phase of the reduction |
|     // |
|     // Three copies of xmm3 are shifted independently and xor-ed. |
|     movdqa %xmm3, %xmm7 |
|     movdqa %xmm3, %xmm8 |
|     movdqa %xmm3, %xmm9 |
|     pslld $31, %xmm7 // packed left shift, << 31 |
|     pslld $30, %xmm8 // packed left shift, << 30 |
|     pslld $25, %xmm9 // packed left shift, << 25 |
|     pxor %xmm8, %xmm7 // xor the shifted versions |
|     pxor %xmm9, %xmm7 |
|     movdqa %xmm7, %xmm8 |
|     psrldq $4, %xmm8 // kept for the second phase |
|     pslldq $12, %xmm7 |
|     pxor %xmm7, %xmm3 // first phase of the reduction complete |
| |
|     // |
|     // Second phase of the reduction |
|     // |
|     // Three copies of xmm3 (in xmm2, xmm4, xmm5) are shifted right. |
|     movdqa %xmm3, %xmm2 |
|     movdqa %xmm3, %xmm4 |
|     movdqa %xmm3, %xmm5 |
|     psrld $1, %xmm2 // packed right shift, >> 1 |
|     psrld $2, %xmm4 // packed right shift, >> 2 |
|     psrld $7, %xmm5 // packed right shift, >> 7 |
|     pxor %xmm4, %xmm2 // xor the shifted versions |
|     pxor %xmm5, %xmm2 |
|     pxor %xmm8, %xmm2 // add the value saved in the first phase |
|     pxor %xmm2, %xmm3 |
|     pxor %xmm3, %xmm6 // the result is in xmm6 |
| |
|     // Swap the result back to the caller's byte order and store it. |
|     pshufb %xmm10, %xmm6 // %xmm10 still holds the swap mask |
|     movdqu %xmm6, (%rdx) // P3 (res) |
| |
|     RET |
|     SET_SIZE(gcm_mul_pclmulqdq) |
| |
| #endif /* lint || __lint */ |
| |
| #ifdef __ELF__ /* ELF objects only */ |
| .section .note.GNU-stack,"",%progbits /* empty note section; by GNU toolchain convention it marks this object as not needing an executable stack */ |
| #endif /* __ELF__ */ |