| /* Optimized strnlen implementation for POWER8 using a vmx loop. |
| |
| Copyright (C) 2017-2018 Free Software Foundation, Inc. |
| This file is part of the GNU C Library. |
| The GNU C Library is free software; you can redistribute it and/or |
| modify it under the terms of the GNU Lesser General Public |
| License as published by the Free Software Foundation; either |
| version 2.1 of the License, or (at your option) any later version. |
| The GNU C Library is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| Lesser General Public License for more details. |
| You should have received a copy of the GNU Lesser General Public |
| License along with the GNU C Library; if not, see |
| <http://www.gnu.org/licenses/>. */ |
| |
| /* The implementation uses the following heuristic (sketched in C |
| below): |
| 1. Case maxlen <= 32: align the pointer to 8 bytes and loop |
| reading doublewords, using the POWER7 algorithm. |
| 2. Case maxlen > 32: check for null bytes in the first 16 bytes |
| using unaligned accesses. Return the length if found. Otherwise: |
| 2.1 Case maxlen < 64: deduct the bytes previously read, align |
| the pointer to 16 bytes and loop reading quadwords until |
| null bytes are found or maxlen is reached. |
| 2.2 Case maxlen >= 64: deduct the bytes previously read, align |
| the pointer to 64 bytes and set up a counter to loop reading |
| in strides of 64 bytes. If the loop finishes with null bytes |
| not found, process the remaining bytes by switching to the |
| 16B loop of heuristic 2.1. */ |
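| |
| /* A rough C sketch of the dispatch above. It is illustrative only |
| (a comment, not part of the build) and the helper names are |
| hypothetical: |
| |
| size_t strnlen (const char *s, size_t maxlen) |
| { |
| if (maxlen <= 32) |
| return small_range (s, maxlen); // POWER7 doubleword loop |
| if (has_null16 (s)) // unaligned 16B peek |
| return early_find (s); |
| if (maxlen < 64) |
| return loop_16B (s, maxlen); // 16B-aligned quadword loop |
| return loop_64B (s, maxlen); // 64B loop; remainder in loop_16B |
| } */ |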
| |
| #include <sysdep.h> |
| |
| /* Define default page size to 4KB. */ |
| #define PAGE_SIZE 4096 |
| |
| /* The following macros implement Power ISA v2.07 opcodes |
| that cannot be used directly in this code, in order to keep |
| compatibility with older binutils versions. */ |
| |
| /* Move from vector register doubleword. */ |
| #define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16))) |
| |
| /* Move to vector register doubleword. */ |
| #define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16))) |
| |
| /* Vector Bit Permute Quadword. */ |
| #define VBPERMQ(t,a,b) .long (0x1000054c \ |
| | ((t)<<(32-11)) \ |
| | ((a)<<(32-16)) \ |
| | ((b)<<(32-21)) ) |
| |
| /* Vector Population Count Halfword. */ |
| #define VPOPCNTH(t,b) .long (0x10000743 | ((t)<<(32-11)) | ((b)<<(32-21))) |
| |
| /* Vector Count Leading Zeros Halfword. */ |
| #define VCLZH(t,b) .long (0x10000742 | ((t)<<(32-11)) | ((b)<<(32-21))) |
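| |
| /* With a binutils version accepting -mpower8, the macros above would |
| be equivalent to the plain mnemonics, e.g. (illustrative): |
| MFVRD(r0,v1) == mfvrd r0,v1 |
| MTVRD(v7,r4) == mtvrd v7,r4 |
| VBPERMQ(v1,v1,v10) == vbpermq v1,v1,v10 |
| VPOPCNTH(v1,v2) == vpopcnth v1,v2 |
| VCLZH(v1,v1) == vclzh v1,v1 */ |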
| |
| |
| /* size_t [r3] strnlen (const char *s [r3], size_t maxlen [r4]) */ |
| /* TODO: change to power8 when minimum required binutils allows it. */ |
| .machine power7 |
| ENTRY_TOCLESS (__strnlen) |
| CALL_MCOUNT 2 |
| dcbt 0,r3 |
| |
| cmpldi r4,32 /* Check if maxlen <= 32. */ |
| ble L(small_range) /* If maxlen <= 32. */ |
| |
| /* The upcoming 16-byte unaligned accesses must not cross the page |
| boundary; otherwise the processor may raise a memory access fault. |
| Use the following check to verify there is room for such accesses: |
| ((size_t) s % PAGE_SIZE) > (PAGE_SIZE - 16) |
| If the accesses are disallowed, switch to the code that handles |
| the string as if maxlen <= 32. */ |
| clrldi r10,r3,52 |
| cmpldi cr7,r10,PAGE_SIZE-16 |
| bgt cr7,L(small_range) /* If within 16B of the page end. */ |
| |
| li r7,0 /* Zero, for the cmpb checks below. */ |
| /* Compute in r8 a bpermd constant that moves bit 0 of each byte |
| of a cmpb result into a halfword value; counting the leading |
| zeros of the combined mask then yields the index of the first |
| null byte. */ |
| #ifdef __LITTLE_ENDIAN__ |
| li r8,0x2820 |
| oris r8,r8,0x3830 |
| sldi r8,r8,32 |
| ori r8,r8,0x0800 |
| oris r8,r8,0x1810 |
| #else |
| li r8,0x1018 |
| oris r8,r8,0x0008 |
| sldi r8,r8,32 |
| ori r8,r8,0x3038 |
| oris r8,r8,0x2028 |
| #endif |
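| |
| /* In C terms the reduction done at L(early_find) with this constant |
| is, illustratively: |
| mask = (bpermd (r8, cmpb_dw1) << 8) | bpermd (r8, cmpb_dw2); |
| return __builtin_clzll (mask) - 48; |
| Each bpermd gathers bit 0 of every byte of a cmpb result, and the |
| constant orders the bits so that the first string byte maps to the |
| most significant mask bit on either endianness. mask is nonzero |
| here because this path is only taken after a null byte was seen. */ |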
| |
| /* maxlen > 32. Optimistically check for null bytes in the first |
| 16 bytes of the string using unaligned accesses. */ |
| ld r5,0(r3) |
| ld r6,8(r3) |
| cmpb r10,r7,r5 /* Check for null bytes in DWORD1. */ |
| cmpb r11,r7,r6 /* Check for null bytes in DWORD2. */ |
| or. r7,r10,r11 |
| bne cr0, L(early_find) /* If found null bytes. */ |
| |
| /* At this point maxlen > 32 and no null bytes were found in the |
| first 16 bytes. Prepare for the loop using VMX. */ |
| |
| /* r3 == s, r4 == maxlen. All other volatile regs are unused now. */ |
| |
| addi r5,r3,16 /* Align up, or just add the 16B we |
| already checked. */ |
| li r0,15 |
| and r7,r5,r0 /* Find offset into 16B alignment. */ |
| andc r5,r5,r0 /* Align s + 16 down to a quadword boundary. */ |
| li r0,16 |
| subf r0,r7,r0 |
| subf r4,r0,r4 /* Deduct unaligned bytes from maxlen. */ |
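| |
| /* Equivalent C for the adjustment above (illustrative): |
| char *p = (char *) (((uintptr_t) s + 16) & ~(uintptr_t) 15); |
| maxlen -= p - s; // the 1..16 bytes below p are already checked */ |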
| |
| |
| /* Compute offsets for vmx loads, and precompute the vbpermq |
| constants for both the 64B and 16B loops. */ |
| li r6,0 |
| vspltisb v0,0 |
| vspltisb v10,3 |
| lvsl v11,r6,r6 |
| vslb v10,v11,v10 |
| |
| cmpldi r4,64 /* Check maxlen < 64. */ |
| blt L(smaller) /* If maxlen < 64 */ |
| |
| /* In order to begin the 64B loop, the pointer must be 64 bytes |
| aligned. So read quadwords until it is aligned or null bytes are |
| found. In the worst case it becomes aligned after the third |
| iteration, so unroll the loop to avoid a counter check. */ |
| andi. r7,r5,63 /* Check if is 64 bytes aligned. */ |
| beq cr0,L(preloop_64B) /* If it is already 64B aligned. */ |
| lvx v1,r5,r6 |
| vcmpequb. v1,v1,v0 |
| addi r5,r5,16 |
| addi r4,r4,-16 /* Decrement maxlen by 16 bytes. */ |
| bne cr6,L(found_aligning64B) /* If found null bytes. */ |
| |
| /* Repeat the block above twice more, until aligned or null bytes |
| are found. */ |
| andi. r7,r5,63 |
| beq cr0,L(preloop_64B) |
| lvx v1,r5,r6 |
| vcmpequb. v1,v1,v0 |
| addi r5,r5,16 |
| addi r4,r4,-16 |
| bne cr6,L(found_aligning64B) |
| |
| andi. r7,r5,63 |
| beq cr0,L(preloop_64B) |
| lvx v1,r5,r6 |
| vcmpequb. v1,v1,v0 |
| addi r5,r5,16 |
| addi r4,r4,-16 |
| bne cr6,L(found_aligning64B) |
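| |
| /* The three blocks above behave like this C loop (illustrative; |
| qw_has_null is a hypothetical helper checking 16 bytes at once): |
| while (((uintptr_t) p & 63) != 0) |
| { |
| if (qw_has_null (p)) |
| goto found_aligning64B; |
| p += 16; |
| maxlen -= 16; |
| } */ |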
| |
| /* At this point the pointer is 64 bytes aligned. |
| Prepare for the 64B loop. */ |
| .p2align 4 |
| L(preloop_64B): |
| /* Check if maxlen has become less than 64, which disallows the |
| 64B loop. If so, switch to the 16B loop code. */ |
| cmpldi r4,64 /* Check if maxlen < 64. */ |
| blt L(smaller) /* If maxlen < 64. */ |
| /* Set some constant values. */ |
| li r7,16 |
| li r10,32 |
| li r9,48 |
| |
| /* Compute the number of 64 bytes iterations needed. */ |
| srdi r11,r4,6 /* Compute loop count (maxlen / 64). */ |
| andi. r4,r4,63 /* Set maxlen the remainder (maxlen % 64). */ |
| mtctr r11 /* Move loop count to counter register. */ |
| |
| /* Handle maxlen >= 64. Loop over the bytes in strides of 64B. */ |
| .p2align 4 |
| L(loop_64B): |
| lvx v1,r5,r6 /* r5 is the pointer to s. */ |
| lvx v2,r5,r7 |
| lvx v3,r5,r10 |
| lvx v4,r5,r9 |
| /* Merge the four 16B vectors with byte-wise minimums so that any |
| null byte propagates into v7, then check v7 for null bytes. */ |
| vminub v5,v1,v2 |
| vminub v6,v3,v4 |
| vminub v7,v5,v6 |
| vcmpequb. v7,v7,v0 /* Check for null bytes. */ |
| addi r5,r5,64 /* Advance pointer to the next 64B block. */ |
| bne cr6,L(found_64B) /* If found null bytes. */ |
| bdnz L(loop_64B) /* Continue the loop if count > 0. */ |
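| |
| /* C model of one iteration above (illustrative): the byte-wise |
| minimum of all 64 bytes is zero iff at least one byte is zero: |
| v = min (min (v1, v2), min (v3, v4)); // vminub twice |
| if (any_byte_is_zero (v)) // vcmpequb. against v0 |
| goto found_64B; |
| p += 64; */ |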
| |
| /* Hit the loop end without a null match; fall through to handle |
| the remainder. */ |
| |
| /* Prepare a 16B loop to handle two cases: |
| 1. If 32 < maxlen < 64. |
| 2. If maxlen >= 64 and the 64B loop ended with null bytes not |
| found. Handle the remainder bytes here. */ |
| .p2align 4 |
| L(smaller): |
| cmpldi r4,0 /* Check maxlen is zero. */ |
| beq L(done) /* If maxlen is zero. */ |
| |
| /* Place the remaining byte count into a VMX register and use |
| vector tricks (a saturating byte-wise counter) to minimize |
| branching. */ |
| MTVRD(v7,r4) /* Copy maxlen from GPR to vector register. */ |
| vspltisb v5,1 |
| vspltisb v6,15 |
| vspltb v2,v7,7 |
| vaddubs v3,v5,v6 |
| |
| #ifdef __LITTLE_ENDIAN__ |
| vspltish v5,1 /* Splat 1 into each halfword; used below |
| to form a mask of trailing zeros. */ |
| #endif |
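| |
| /* C model of the loop exit below (illustrative): the saturating |
| byte-wise counter fabricates a null byte once maxlen is exhausted, |
| so no separate counter check is needed: |
| prev = count; // v7 <- v2 |
| count = count >= 16 ? count - 16 : 0; // vsububs, per byte |
| merged = min (bytes, count); // vminub, per byte |
| The loop exits when merged contains a zero byte; prev then bounds |
| the reported match offset via the vsel below. */ |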
| |
| /* Loop in 16B aligned increments now. */ |
| .p2align 4 |
| L(loop_16B): |
| lvx v1,r5,r6 /* Load quadword into vector register. */ |
| addi r5,r5,16 /* Increment address to next 16B block. */ |
| vor v7,v2,v2 /* Save loop count (v2) into v7. */ |
| vsububs v2,v2,v3 /* Subtract 16B from count, saturate at 0. */ |
| vminub v4,v1,v2 |
| vcmpequb. v4,v4,v0 /* Check for null bytes. */ |
| beq cr6,L(loop_16B) /* If null bytes not found. */ |
| |
| vcmpequb v1,v1,v0 |
| VBPERMQ(v1,v1,v10) |
| #ifdef __LITTLE_ENDIAN__ |
| vsubuhm v2,v1,v5 /* Form a mask of trailing zeros. */ |
| vandc v2,v2,v1 |
| VPOPCNTH(v1,v2) /* Count of trailing zeros, 16 if none. */ |
| #else |
| VCLZH(v1,v1) /* Count the leading zeros, 16 if none. */ |
| #endif |
| /* Truncate to maximum allowable offset. */ |
| vcmpgtub v2,v1,v7 /* Compare and truncate for matches beyond |
| maxlen. */ |
| vsel v1,v1,v7,v2 /* 0-16 is now in byte 7. */ |
| |
| MFVRD(r0,v1) |
| addi r5,r5,-16 /* Undo speculative bump. */ |
| extsb r0,r0 /* Clear whatever gunk is in the high 56b. */ |
| add r5,r5,r0 /* Add the offset of whatever was found. */ |
| L(done): |
| subf r3,r3,r5 /* Length equals the offset of the matched |
| null byte minus the pointer to s. */ |
| blr /* Done. */ |
| |
| /* Handle the case of maxlen > 64 where null bytes were found in the |
| last block of 64 bytes read. */ |
| .p2align 4 |
| L(found_64B): |
| /* A zero was found. Reduce the result. */ |
| vcmpequb v1,v1,v0 |
| vcmpequb v2,v2,v0 |
| vcmpequb v3,v3,v0 |
| vcmpequb v4,v4,v0 |
| |
| /* Permute the first bit of each byte into bits 48-63. */ |
| VBPERMQ(v1,v1,v10) |
| VBPERMQ(v2,v2,v10) |
| VBPERMQ(v3,v3,v10) |
| VBPERMQ(v4,v4,v10) |
| |
| /* Shift each component into its correct position for merging. */ |
| #ifdef __LITTLE_ENDIAN__ |
| vsldoi v2,v2,v2,2 |
| vsldoi v3,v3,v3,4 |
| vsldoi v4,v4,v4,6 |
| #else |
| vsldoi v1,v1,v1,6 |
| vsldoi v2,v2,v2,4 |
| vsldoi v3,v3,v3,2 |
| #endif |
| |
| /* Merge the results and move to a GPR. */ |
| vor v1,v2,v1 |
| vor v2,v3,v4 |
| vor v4,v1,v2 |
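| /* v4 now holds in its first doubleword a 64-bit mask with one bit |
| per byte of the 64B block, arranged so that counting trailing (LE) |
| or leading (BE) zeros gives the number of bytes before the first |
| null byte. */ |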
| |
| /* Adjust address to the start of the current 64B block. */ |
| addi r5,r5,-64 |
| |
| MFVRD(r10,v4) |
| #ifdef __LITTLE_ENDIAN__ |
| addi r9,r10,-1 /* Form a mask from trailing zeros. */ |
| andc r9,r9,r10 |
| popcntd r0,r9 /* Count the bits in the mask. */ |
| #else |
| cntlzd r0,r10 /* Count leading zeros before the match. */ |
| #endif |
| subf r5,r3,r5 |
| add r3,r5,r0 /* Compute final length. */ |
| blr /* Done. */ |
| |
| /* Handle the case where null bytes were found while aligning in |
| preparation for the 64B loop. */ |
| .p2align 4 |
| L(found_aligning64B): |
| VBPERMQ(v1,v1,v10) |
| #ifdef __LITTLE_ENDIAN__ |
| MFVRD(r10,v1) |
| addi r9,r10,-1 /* Form a mask from trailing zeros. */ |
| andc r9,r9,r10 |
| popcntd r0,r9 /* Count the bits in the mask. */ |
| #else |
| vsldoi v1,v1,v1,6 |
| MFVRD(r10,v1) |
| cntlzd r0,r10 /* Count leading zeros before the match. */ |
| #endif |
| addi r5,r5,-16 /* Adjust address to the offset of the |
| last 16 bytes read. */ |
| /* The length is the offset of the last 16 bytes read minus the |
| pointer to s, plus the number of bytes before the match. */ |
| subf r5,r3,r5 |
| add r3,r5,r0 |
| blr /* Done. */ |
| |
| /* Handle the case of maxlen > 32 where null bytes were found within |
| the first 16 bytes of s. */ |
| .p2align 4 |
| L(early_find): |
| bpermd r5,r8,r10 /* r8 contains the bit permute constants. */ |
| bpermd r6,r8,r11 |
| sldi r5,r5,8 |
| or r5,r5,r6 /* r5 now holds a 16-bit mask flagging |
| any null bytes found. */ |
| cntlzd r5,r5 /* Count leading zeros. */ |
| addi r3,r5,-48 /* Deduct the 48 leading zeros always |
| present. */ |
| blr /* Done. */ |
| |
| /* Handle case of maxlen <= 32. Use the POWER7 algorithm. */ |
| .p2align 4 |
| L(small_range): |
| clrrdi r8,r3,3 /* Align the pointer to 8B. */ |
| li r0,0 |
| /* Registers' contents at this point: |
| r3 == pointer to s, r4 == maxlen, r8 == pointer to s aligned to 8B, |
| r7 == last acceptable address (computed below). */ |
| cmpldi r4,0 /* Check if maxlen is zero. */ |
| beq L(end_max) /* If maxlen is zero. */ |
| |
| /* Calculate the last acceptable address and check for possible |
| addition overflow by using saturated math: |
| r7 = r3 + r4 |
| r7 |= -(r7 < r3) */ |
| add r7,r3,r4 |
| subfc r6,r3,r7 |
| subfe r9,r9,r9 |
| extsw r6,r9 |
| or r7,r7,r6 |
| addi r7,r7,-1 |
| |
| clrrdi r7,r7,3 /* Align the last acceptable address |
| to 8B. */ |
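| |
| /* Equivalent C for the computation above (illustrative): |
| uintptr_t end = (uintptr_t) s + maxlen; |
| end |= -(uintptr_t) (end < (uintptr_t) s); // saturate on overflow |
| end = (end - 1) & ~(uintptr_t) 7; // last dword to read */ |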
| |
| rlwinm r6,r3,3,26,28 /* Calculate padding. */ |
| ld r12,0(r8) /* Load aligned doubleword. */ |
| cmpb r10,r12,r0 /* Check for null bytes. */ |
| #ifdef __LITTLE_ENDIAN__ |
| srd r10,r10,r6 |
| sld r10,r10,r6 |
| #else |
| sld r10,r10,r6 |
| srd r10,r10,r6 |
| #endif /* __LITTLE_ENDIAN__ */ |
| cmpldi cr7,r10,0 |
| bne cr7,L(done_small) /* If found null byte. */ |
| |
| cmpld r8,r7 /* Check if reached maxlen. */ |
| beq L(end_max) /* If reached maxlen. */ |
| |
| /* Still handling the case of maxlen <= 32. Read aligned doublewords |
| until null bytes are found or maxlen is reached. */ |
| .p2align 4 |
| L(loop_small): |
| ldu r12,8(r8) /* Load next doubleword and update r8. */ |
| cmpb r10,r12,r0 /* Check for null bytes. */ |
| cmpldi cr6,r10,0 |
| bne cr6,L(done_small) /* If found null bytes. */ |
| cmpld r8,r7 /* Check if reached maxlen. */ |
| bne L(loop_small) /* If it has more bytes to read. */ |
| mr r3,r4 /* Reached maxlen with null bytes not found. |
| Length is equal to maxlen. */ |
| blr /* Done. */ |
| |
| /* Still handling case of maxlen <= 32. Found null bytes. |
| Registers: r10 == match bits within doubleword, r8 == address of |
| last doubleword read, r3 == pointer to s, r4 == maxlen. */ |
| .p2align 4 |
| L(done_small): |
| #ifdef __LITTLE_ENDIAN__ |
| /* Count trailing zeros. */ |
| addi r0,r10,-1 |
| andc r0,r0,r10 |
| popcntd r0,r0 |
| #else |
| cntlzd r0,r10 /* Count leading zeros before the match. */ |
| #endif |
| sub r3,r8,r3 /* Calculate total of bytes before the match. */ |
| srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */ |
| add r3,r3,r0 /* Length until the match. */ |
| cmpld r3,r4 /* Check if the length exceeds maxlen. */ |
| blelr |
| mr r3,r4 /* If length is greater than maxlen, return |
| maxlen. */ |
| blr |
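| |
| /* C model of the reduction above (illustrative): |
| len = (dword_addr - s) + zeros / 8; // bytes before the match |
| return len <= maxlen ? len : maxlen; |
| where zeros counts the trailing (LE) or leading (BE) zero bits of |
| the cmpb mask, i.e. the bits before the first matching byte. */ |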
| |
| /* Handle case of reached maxlen with null bytes not found. */ |
| .p2align 4 |
| L(end_max): |
| mr r3,r4 /* Length is equal to maxlen. */ |
| blr /* Done. */ |
| |
| |
| END (__strnlen) |
| libc_hidden_def (__strnlen) |
| weak_alias (__strnlen, strnlen) |
| libc_hidden_def (strnlen) |