/* Optimized strlen implementation for PowerPC64/POWER8 using a vectorized
   loop.
   Copyright (C) 2016-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* TODO: change these to the actual instructions when the minimum required
   binutils allows it.  */
#define MFVRD(r,v)	.long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
#define VBPERMQ(t,a,b)	.long (0x1000054c \
			       | ((t)<<(32-11)) \
			       | ((a)<<(32-16)) \
			       | ((b)<<(32-21)) )
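
/* Informal note (assuming the standard POWER8 encodings): MFVRD(r,v)
   encodes the "mfvrd r,v" move of a doubleword from vector register v into
   GPR r, and VBPERMQ(t,a,b) encodes "vbpermq t,a,b"; the shift amounts
   above simply place each operand number in its field of the fixed .long
   opcode.  */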

/* int [r3] strlen (char *s [r3])  */

#ifndef STRLEN
# define STRLEN strlen
#endif

/* TODO: change this to .machine power8 when the minimum required binutils
   allows it.  */
	.machine  power7
ENTRY_TOCLESS (STRLEN, 4)
	CALL_MCOUNT 1
	dcbt	0,r3
	clrrdi	r4,r3,3	      /* Align the address to doubleword boundary.  */
	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
	li	r0,0	      /* Doubleword with null chars to use
				 with cmpb.  */
	li	r5,-1	      /* MASK = 0xffffffffffffffff.  */
	ld	r12,0(r4)     /* Load doubleword from memory.  */
#ifdef __LITTLE_ENDIAN__
	sld	r5,r5,r6      /* MASK = MASK << padding.  */
#else
	srd	r5,r5,r6      /* MASK = MASK >> padding.  */
#endif
	orc	r9,r12,r5     /* Mask bits that are not part of the string.  */
	cmpb	r10,r9,r0     /* Check for null bytes in DWORD1.  */
	cmpdi	cr7,r10,0     /* If r10 == 0, no null bytes were found.  */
	bne	cr7,L(done)
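
	/* Rough C-level sketch of the check above (illustrative only; these
	   names are not part of this file).  The bytes that precede the start
	   of the string are forced to 0xff so they can never look like a
	   terminator (the little-endian form is shown; big-endian shifts the
	   all-ones mask right instead), and cmpb then marks every zero byte:

	       uint64_t *aligned = (uint64_t *) ((uintptr_t) s & ~7UL);
	       unsigned pad_bits = ((uintptr_t) s & 7) * 8;
	       uint64_t dword = *aligned | ~(~0ULL << pad_bits);
	       // A nonzero cmpb (dword, 0) result means the terminator is in
	       // this first doubleword.  */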

	/* For shorter strings (< 64 bytes), we will not use vector registers,
	   as the overhead isn't worth it.  So, let's use GPRs instead.  This
	   will be done the same way as we do in the POWER7 implementation.
	   Let's see if we are aligned to a quadword boundary.  If so, we can
	   jump to the first (non-vectorized) loop.  Otherwise, we have to
	   handle the next DWORD first.  */
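	/* The low four bits of the doubleword-aligned address are copied into
	   CR field 7, so CR bit 28 reflects the '8' bit of the address: when
	   it is set, the next doubleword at r4 + 8 is already quadword
	   aligned and the extra 8-byte step below can be skipped.  */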
	mtcrf	0x01,r4
	mr	r9,r4
	addi	r9,r9,8
	bt	28,L(align64)

	/* Handle the next 8 bytes so we are aligned to a quadword
	   boundary.  */
	ldu	r5,8(r4)
	cmpb	r10,r5,r0
	cmpdi	cr7,r10,0
	addi	r9,r9,8
	bne	cr7,L(done)

L(align64):
	/* Proceed to the old (POWER7) implementation, checking two doublewords
	   per iteration.  For the first 56 bytes, we will just check for null
	   characters.  After that, we will also check if we are 64-byte aligned
	   so we can jump to the vectorized implementation.  We will unroll
	   these loops to avoid excessive branching.  */
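	/* Each unrolled step below is roughly equivalent to (C-level sketch,
	   with illustrative names only; cmpb () stands for the byte-compare
	   instruction):

	       uint64_t d1 = p[0], d2 = p[1];	/. ld / ldu
	       uint64_t z1 = cmpb (d1, 0);	.. 0xff in every null byte
	       uint64_t z2 = cmpb (d2, 0);
	       p += 2;
	       if ((z1 | z2) != 0)		.. terminator in either word?
		 goto dword_zero;

	   (the ".." markers stand in for C comments, which cannot nest
	   here).  */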
	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	/* Are we 64-byte aligned?  If so, jump to the vectorized loop.
	   Note: aligning to a 64-byte boundary will necessarily slow down
	   performance for strings around 64 bytes in length due to the extra
	   comparisons required to check alignment for the vectorized loop.
	   This is a necessary tradeoff we are willing to take in order to
	   speed up the calculation for larger strings.  */
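	/* r9 always points one doubleword past the bytes already checked (it
	   is kept at r4 + 8, where r4 is the last doubleword loaded), so the
	   "andi." below tests whether the next byte to examine sits on a
	   64-byte boundary.  */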
	andi.	r10,r9,63
	beq	cr0,L(preloop)
	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	andi.	r10,r9,63
	beq	cr0,L(preloop)
	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	andi.	r10,r9,63
	beq	cr0,L(preloop)
	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16

	/* At this point, we are necessarily 64-byte aligned.  If no zeroes were
	   found, jump to the vectorized loop.  */
	beq	cr7,L(preloop)

L(dword_zero):
	/* OK, one (or both) of the doublewords contains a null byte.  Check
	   the first doubleword and decrement the address in case the first
	   doubleword really contains a null byte.  */

	cmpdi	cr6,r10,0
	addi	r4,r4,-8
	bne	cr6,L(done)

	/* The null byte must be in the second doubleword.  Adjust the address
	   again and move the result of cmpb to r10 so we can calculate the
	   length.  */

	mr	r10,r11
	addi	r4,r4,8

	/* If the null byte was found in the non-vectorized code, compute the
	   final length.  r10 has the output of the cmpb instruction, that is,
	   it contains 0xff in the same position as the null byte in the
	   original doubleword from the string.  Use that to calculate the
	   length.  */
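	/* Worked example (little-endian): if the terminator is the third byte
	   of the doubleword, r10 = 0x0000000000ff0000.  Then r10 - 1 =
	   0x0000000000feffff, clearing the bits of r10 leaves
	   0x000000000000ffff, popcntd counts 16 bits, and 16 >> 3 = 2, the
	   byte offset of the null byte.  Big-endian gets the same offset from
	   cntlzd >> 3.  */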
L(done):
#ifdef __LITTLE_ENDIAN__
	addi	r9,r10,-1     /* Form a mask from trailing zeros.  */
	andc	r9,r9,r10
	popcntd	r0,r9	      /* Count the bits in the mask.  */
#else
	cntlzd	r0,r10	      /* Count leading zeros before the match.  */
#endif
	subf	r5,r3,r4
	srdi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
	add	r3,r5,r0      /* Compute final length.  */
	blr

	/* Vectorized implementation starts here.  */
	.p2align  4
L(preloop):
	/* Set up for the loop.  */
	mr	r4,r9
	li	r7,16	      /* Load required offsets.  */
	li	r8,32
	li	r9,48
	li	r12,8
	vxor	v0,v0,v0      /* VR with null chars to use
				 with vcmpequb.  */

	/* Main loop to look for the end of the string.  We will read in
	   64-byte chunks.  Align it to 32 bytes and unroll it 3 times to
	   leverage the icache performance.  */
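	/* Loose C-level sketch of one unrolled step below (illustrative only;
	   the real code keeps everything in vector registers):

	       int lane, has_null = 0;
	       for (lane = 0; lane < 16; lane++)
		 {
		   uint8_t m = p[lane];	       /. vminub: the unsigned minimum
		   if (p[lane + 16] < m)	  across the four quadwords
		     m = p[lane + 16];		  preserves any zero byte in
		   if (p[lane + 32] < m)	  that lane, so one compare of
		     m = p[lane + 32];		  the merged vector against
		   if (p[lane + 48] < m)	  zero (vcmpequb.) covers the
		     m = p[lane + 48];		  whole 64-byte block.  ./
		   if (m == 0)
		     has_null = 1;
		 }
	       p += 64;  */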
	.p2align  5
L(loop):
	lvx	v1,r4,r0      /* Load 4 quadwords.  */
	lvx	v2,r4,r7
	lvx	v3,r4,r8
	lvx	v4,r4,r9
	vminub	v5,v1,v2      /* Compare and merge into one VR for speed.  */
	vminub	v6,v3,v4
	vminub	v7,v5,v6
	vcmpequb. v7,v7,v0    /* Check for NULLs.  */
	addi	r4,r4,64      /* Adjust address for the next iteration.  */
	bne	cr6,L(vmx_zero)

	lvx	v1,r4,r0      /* Load 4 quadwords.  */
	lvx	v2,r4,r7
	lvx	v3,r4,r8
	lvx	v4,r4,r9
	vminub	v5,v1,v2      /* Compare and merge into one VR for speed.  */
	vminub	v6,v3,v4
	vminub	v7,v5,v6
	vcmpequb. v7,v7,v0    /* Check for NULLs.  */
	addi	r4,r4,64      /* Adjust address for the next iteration.  */
	bne	cr6,L(vmx_zero)

	lvx	v1,r4,r0      /* Load 4 quadwords.  */
	lvx	v2,r4,r7
	lvx	v3,r4,r8
	lvx	v4,r4,r9
	vminub	v5,v1,v2      /* Compare and merge into one VR for speed.  */
	vminub	v6,v3,v4
	vminub	v7,v5,v6
	vcmpequb. v7,v7,v0    /* Check for NULLs.  */
	addi	r4,r4,64      /* Adjust address for the next iteration.  */
	beq	cr6,L(loop)

L(vmx_zero):
	/* OK, we found a null byte.  Let's look for it in the current 64-byte
	   block and mark it in its corresponding VR.  */
	vcmpequb	v1,v1,v0
	vcmpequb	v2,v2,v0
	vcmpequb	v3,v3,v0
	vcmpequb	v4,v4,v0
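	/* Each vcmpequb above turns its quadword into a byte mask: 0xff where
	   the original byte was zero, 0x00 elsewhere.  */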

	/* We will now 'compress' the result into a single doubleword, so it
	   can be moved to a GPR for the final calculation.  First, we generate
	   an appropriate mask for vbpermq, so we can permute the relevant bits
	   into a single halfword.  */
	vspltisb	v10,3
	lvsl	v11,r0,r0
	vslb	v10,v11,v10
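	/* The mask built above is the byte sequence 0, 8, 16, ..., 120:
	   vspltisb splats 3 into every byte, lvsl with a zero address yields
	   0 ... 15, and vslb shifts each byte left by 3.  Those values are the
	   bit indices of the first bit of every byte, which is exactly the bit
	   vbpermq gathers from each comparison result.  */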

	/* Permute the first bit of each byte into bits 48-63.  */
	VBPERMQ(v1,v1,v10)
	VBPERMQ(v2,v2,v10)
	VBPERMQ(v3,v3,v10)
	VBPERMQ(v4,v4,v10)

	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	vsldoi	v2,v2,v2,2
	vsldoi	v3,v3,v3,4
	vsldoi	v4,v4,v4,6
#else
	vsldoi	v1,v1,v1,6
	vsldoi	v2,v2,v2,4
	vsldoi	v3,v3,v3,2
#endif
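	/* Each VBPERMQ result holds its 16 bits (one per byte of that
	   quadword) in a single halfword; the rotations above move the four
	   halfword masks into distinct halfword positions so that the
	   vor/MFVRD sequence below can collect them into one 64-bit mask in
	   r10, ordered to match the endian-specific bit count at the end.  */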

	/* Merge the results and move to a GPR.  */
	vor	v1,v2,v1
	vor	v2,v3,v4
	vor	v4,v1,v2
	MFVRD(r10,v4)

	/* Adjust address to the beginning of the current 64-byte block.  */
	addi	r4,r4,-64

#ifdef __LITTLE_ENDIAN__
	addi	r9,r10,-1     /* Form a mask from trailing zeros.  */
	andc	r9,r9,r10
	popcntd	r0,r9	      /* Count the bits in the mask.  */
#else
	cntlzd	r0,r10	      /* Count leading zeros before the match.  */
#endif
	subf	r5,r3,r4
	add	r3,r5,r0      /* Compute final length.  */
	blr

END (STRLEN)
libc_hidden_builtin_def (strlen)