| /* Optimized strcasecmp implementation for PowerPC64. |
| Copyright (C) 2016-2018 Free Software Foundation, Inc. |
| This file is part of the GNU C Library. |
| |
| The GNU C Library is free software; you can redistribute it and/or |
| modify it under the terms of the GNU Lesser General Public |
| License as published by the Free Software Foundation; either |
| version 2.1 of the License, or (at your option) any later version. |
| |
| The GNU C Library is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| Lesser General Public License for more details. |
| |
| You should have received a copy of the GNU Lesser General Public |
| License along with the GNU C Library; if not, see |
| <http://www.gnu.org/licenses/>. */ |
| |
| #include <sysdep.h> |
| #include <locale-defines.h> |
| |
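| /* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4]) |
| int [r3] strncasecmp (const char *s1 [r3], const char *s2 [r4], |
| size_t n [r5]) when USE_AS_STRNCASECMP is defined. */ |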
| |
| /* Pick the symbol names for this build. The same source provides |
| strncasecmp when USE_AS_STRNCASECMP is defined and strcasecmp |
| otherwise. __STRCASECMP names the entry point and STRCASECMP is the |
| name passed to weak_alias at the end of the file. */ |
| #ifdef USE_AS_STRNCASECMP |
| # define __STRCASECMP __strncasecmp |
| # define STRCASECMP strncasecmp |
| #else |
| # define __STRCASECMP __strcasecmp |
| # define STRCASECMP strcasecmp |
| #endif |
| /* Convert 16 bytes to lowercase and compare. |
| Inputs: v4 and v5 hold 16 bytes of each string; v1, v2 and v3 are the |
| constant vectors prepared by the caller (0xbf, 0x19 and 0x20 in every |
| byte). |
| A byte c is converted when (c + v1) mod 256 is not greater than v2 as |
| an unsigned value; converted bytes get v3 added, the rest are kept. |
| Outputs: v4 and v5 converted in place; v7 = per-byte equality mask of |
| v5 and v4, and CR6 is set by the final record-form compare, so |
| blt cr6 is taken when all 16 bytes are equal. |
| Clobbers: v7, v8, CR6. */ |
| #define TOLOWER() \ |
| vaddubm v8, v4, v1; /* v8 = v4 + v1, the range-test value. */ \ |
| vaddubm v7, v4, v3; /* v7 = v4 + v3, the converted candidate. */ \ |
| vcmpgtub v8, v8, v2; /* v8 = 0xff where the byte is out of range. */ \ |
| vsel v4, v7, v4, v8; /* Take v7 where v8 is 0, keep v4 elsewhere. */ \ |
| vaddubm v8, v5, v1; /* Same four steps for v5. */ \ |
| vaddubm v7, v5, v3; \ |
| vcmpgtub v8, v8, v2; \ |
| vsel v5, v7, v5, v8; \ |
| vcmpequb. v7, v5, v4; /* v7 = 0xff where equal; sets CR6. */ |
| |
| /* |
| * Get 16 bytes for unaligned case. |
| * reg1: Vector to hold next 16 bytes. |
| * reg2: Address to read from. |
| * reg3: Permute control vector. |
| * v8: Tmp vector used to mask unwanted bytes. |
| * v9: Tmp vector,0 when null is found on first 16 bytes |
| * v0: Read as the all-zero vector for the null test; the caller must |
| * have set it up. |
| * r6 and CR6 are clobbered as well. |
| * The second quadword (reg2 + 16) is loaded only when the bytes |
| * selected from the first quadword contain no null; otherwise zero |
| * is merged in its place. |
| * Every expansion defines the numeric local labels 1 and 2. |
| */ |
| #ifdef __LITTLE_ENDIAN__ |
| #define GET16BYTES(reg1, reg2, reg3) \ |
| lvx reg1, 0, reg2; /* First quadword. */ \ |
| vspltisb v8, -1; /* v8 = 0xff in every byte. */ \ |
| vperm v8, v8, reg1, reg3; /* Keep string bytes, pad the rest with 0xff. */ \ |
| vcmpequb. v8, v0, v8; /* Look for a null among them. */ \ |
| beq cr6, 1f; /* No byte matched: fetch the next quadword. */ \ |
| vspltisb v9, 0; /* Null seen: merge zeros instead of loading. */ \ |
| b 2f; \ |
| .align 4; \ |
| 1: \ |
| addi r6, reg2, 16; \ |
| lvx v9, 0, r6; /* Second quadword. */ \ |
| 2: \ |
| vperm reg1, v9, reg1, reg3; /* Combine both parts into reg1. */ |
| #else |
| #define GET16BYTES(reg1, reg2, reg3) \ |
| lvx reg1, 0, reg2; /* First quadword. */ \ |
| vspltisb v8, -1; /* v8 = 0xff in every byte. */ \ |
| vperm v8, reg1, v8, reg3; /* Keep string bytes, pad the rest with 0xff. */ \ |
| vcmpequb. v8, v0, v8; /* Look for a null among them. */ \ |
| beq cr6, 1f; /* No byte matched: fetch the next quadword. */ \ |
| vspltisb v9, 0; /* Null seen: merge zeros instead of loading. */ \ |
| b 2f; \ |
| .align 4; \ |
| 1: \ |
| addi r6, reg2, 16; \ |
| lvx v9, 0, r6; /* Second quadword. */ \ |
| 2: \ |
| vperm reg1, reg1, v9, reg3; /* Combine both parts into reg1. */ |
| #endif |
| |
| /* Check null in v4, v5 and convert to lower. |
| Control leaves for L(null_found) only when both v5 and v4 contain a |
| zero byte; v7 then holds the zero-byte mask of v4. When either vector |
| has no zero byte, TOLOWER() runs and leaves its compare result in v7 |
| and CR6 for the caller to branch on. |
| Reads v0 as the all-zero vector and defines numeric local label 3. */ |
| #define CHECKNULLANDCONVERT() \ |
| vcmpequb. v7, v0, v5; /* Zero bytes in v5? */ \ |
| beq cr6, 3f; /* None: go convert and compare. */ \ |
| vcmpequb. v7, v0, v4; /* Zero bytes in v4? */ \ |
| beq cr6, 3f; /* None: go convert and compare. */ \ |
| b L(null_found); \ |
| .align 4; \ |
| 3: \ |
| TOLOWER() |
| |
| /* Five instructions are emitted as raw instruction words unless |
| _ARCH_PWR8 is defined, in which case the mnemonics are used. Each |
| macro name spells the fixed operands it was written for; note that |
| VADDUQM_V7_V8 places its result in v9. */ |
| #ifndef _ARCH_PWR8 |
| # define VCLZD_V8_v7 .long 0x11003fc2 /* vclzd v8, v7 */ |
| # define MFVRD_R3_V1 .long 0x7c230067 /* mfvrd r3, v1 */ |
| # define VSUBUDM_V9_V8 .long 0x112944c0 /* vsubudm v9, v9, v8 */ |
| # define VPOPCNTD_V8_V8 .long 0x110047c3 /* vpopcntd v8, v8 */ |
| # define VADDUQM_V7_V8 .long 0x11274100 /* vadduqm v9, v7, v8 */ |
| #else |
| # define VCLZD_V8_v7 vclzd v8, v7; |
| # define MFVRD_R3_V1 mfvrd r3, v1; |
| # define VSUBUDM_V9_V8 vsubudm v9, v9, v8; |
| # define VPOPCNTD_V8_V8 vpopcntd v8, v8; |
| # define VADDUQM_V7_V8 vadduqm v9, v7, v8; |
| #endif |
| |
| .machine power7 |
| /* Entry point shared by strcasecmp and strncasecmp. |
| In: r3 = s1, r4 = s2, and r5 = n in the strncasecmp build. |
| Out: r3 = 0 when the strings match, otherwise the difference of the |
| first pair of converted characters that differ. |
| Identical pointers (and n == 0) return 0 at once. The byte-by-byte |
| table lookup is used when n < 16, when a null lies in the part of the |
| first aligned quadword that belongs to either string, or when the |
| locale word tested below equals 1. Everything else goes through the |
| 16-byte vector compare. */ |
| ENTRY (__STRCASECMP) |
| #ifdef USE_AS_STRNCASECMP |
| CALL_MCOUNT 3 |
| #else |
| CALL_MCOUNT 2 |
| #endif |
| #define rRTN r3 /* Return value */ |
| #define rSTR1 r10 /* 1st string */ |
| #define rSTR2 r4 /* 2nd string */ |
| #define rCHAR1 r6 /* Byte read from 1st string */ |
| #define rCHAR2 r7 /* Byte read from 2nd string */ |
| #define rADDR1 r8 /* Address of tolower(rCHAR1) */ |
| #define rADDR2 r12 /* Address of tolower(rCHAR2) */ |
| #define rLWR1 r8 /* Word tolower(rCHAR1) */ |
| #define rLWR2 r12 /* Word tolower(rCHAR2) */ |
| #define rTMP r9 /* Scratch */ |
| #define rLOC r11 /* Default locale address */ |
| |
| cmpd cr7, rRTN, rSTR2 /* s1 == s2 ? Tested by beqlr below. */ |
| |
| /* Get locale address. */ |
| ld rTMP, __libc_tsd_LOCALE@got@tprel(r2) |
| add rLOC, rTMP, __libc_tsd_LOCALE@tls |
| ld rLOC, 0(rLOC) /* rLOC = value stored in __libc_tsd_LOCALE. */ |
| |
| mr rSTR1, rRTN /* Keep s1 in r10; r3 becomes the result. */ |
| li rRTN, 0 |
| beqlr cr7 /* Same pointer: return 0. */ |
| #ifdef USE_AS_STRNCASECMP |
| cmpdi cr7, r5, 0 |
| beq cr7, L(retnull) /* n == 0: return 0. */ |
| cmpdi cr7, r5, 16 /* NOTE(review): signed compare, so n >= 2^63 also takes the branch below. */ |
| blt cr7, L(bytebybyte) /* Fewer than 16 bytes: scalar path. */ |
| #endif |
| vspltisb v0, 0 /* v0 = 0x00 in every byte. */ |
| vspltisb v8, -1 /* v8 = 0xff in every byte. */ |
| /* Check for null in initial characters. |
| Check max of 16 char depending on the alignment. |
| If null is present, proceed byte by byte. */ |
| lvx v4, 0, rSTR1 /* Aligned quadword that contains s1. */ |
| #ifdef __LITTLE_ENDIAN__ |
| lvsr v10, 0, rSTR1 /* Compute mask. */ |
| vperm v9, v8, v4, v10 /* Mask bits that are not part of string. */ |
| #else |
| lvsl v10, 0, rSTR1 /* Compute mask. */ |
| vperm v9, v4, v8, v10 /* Mask bits that are not part of string. */ |
| #endif |
| vcmpequb. v9, v0, v9 /* Check for null bytes. */ |
| bne cr6, L(bytebybyte) /* At least one null: scalar path. */ |
| lvx v5, 0, rSTR2 /* Aligned quadword that contains s2. */ |
| /* Calculate alignment. */ |
| #ifdef __LITTLE_ENDIAN__ |
| lvsr v6, 0, rSTR2 |
| vperm v9, v8, v5, v6 /* Mask bits that are not part of string. */ |
| #else |
| lvsl v6, 0, rSTR2 |
| vperm v9, v5, v8, v6 /* Mask bits that are not part of string. */ |
| #endif |
| vcmpequb. v9, v0, v9 /* Check for null bytes. */ |
| bne cr6, L(bytebybyte) /* At least one null: scalar path. */ |
| /* Check if locale has non ascii characters. */ |
| ld rTMP, 0(rLOC) /* Pointer stored in the first doubleword at rLOC. */ |
| addi r6, rTMP,LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES |
| lwz rTMP, 0(r6) |
| cmpdi cr7, rTMP, 1 |
| beq cr7, L(bytebybyte) /* Value 1: use the table lookup path. */ |
| |
| /* Load vector registers with values used for TOLOWER. */ |
| /* Load v1 = 0xbf, v2 = 0x19 v3 = 0x20 in each byte. */ |
| vspltisb v3, 2 |
| vspltisb v9, 4 |
| vsl v3, v3, v9 /* 0x02 shifted left by 4 bits: v3 = 0x20. */ |
| vaddubm v1, v3, v3 /* 0x40 */ |
| vnor v1, v1, v1 /* v1 = ~0x40 = 0xbf. */ |
| vspltisb v2, 7 |
| vsububm v2, v3, v2 /* v2 = 0x20 - 7 = 0x19. */ |
| |
| andi. rADDR1, rSTR1, 0xF /* Offset of s1 inside its quadword. */ |
| beq cr0, L(align) /* Aligned: v4 already holds 16 string bytes. */ |
| addi r6, rSTR1, 16 |
| lvx v9, 0, r6 /* Following quadword of s1. */ |
| /* Compute 16 bytes from previous two loads. */ |
| #ifdef __LITTLE_ENDIAN__ |
| vperm v4, v9, v4, v10 |
| #else |
| vperm v4, v4, v9, v10 |
| #endif |
| L(align): |
| andi. rADDR2, rSTR2, 0xF /* Offset of s2 inside its quadword. */ |
| beq cr0, L(align1) /* Aligned: v5 already holds 16 string bytes. */ |
| addi r6, rSTR2, 16 |
| lvx v9, 0, r6 /* Following quadword of s2. */ |
| /* Compute 16 bytes from previous two loads. */ |
| #ifdef __LITTLE_ENDIAN__ |
| vperm v5, v9, v5, v6 |
| #else |
| vperm v5, v5, v9, v6 |
| #endif |
| L(align1): |
| CHECKNULLANDCONVERT() |
| blt cr6, L(match) /* All 16 converted bytes are equal. */ |
| b L(different) |
| .align 4 |
| L(match): |
| clrldi r6, rSTR1, 60 /* r6 = s1 & 15. */ |
| subfic r7, r6, 16 /* r7 = 16 - r6, distance to the next 16-byte boundary of s1. */ |
| #ifdef USE_AS_STRNCASECMP |
| sub r5, r5, r7 /* Those r7 bytes are accounted for. */ |
| #endif |
| add rSTR1, rSTR1, r7 /* s1 is 16-byte aligned from here on. */ |
| add rSTR2, rSTR2, r7 |
| andi. rADDR2, rSTR2, 0xF /* CR0.eq when s2 is aligned as well. */ |
| addi rSTR1, rSTR1, -16 /* Both loops below start by adding 16. */ |
| addi rSTR2, rSTR2, -16 |
| beq cr0, L(aligned) |
| #ifdef __LITTLE_ENDIAN__ |
| lvsr v6, 0, rSTR2 /* Permute control for the new s2 offset. */ |
| #else |
| lvsl v6, 0, rSTR2 /* Permute control for the new s2 offset. */ |
| #endif |
| /* There are 2 loops depending on the input alignment. |
| Each loop gets 16 bytes from s1 and s2, check for null, |
| convert to lowercase and compare. Loop till difference |
| or null occurs. |
| L(s1_align) handles an unaligned s2 through GET16BYTES and |
| L(aligned) loads both strings with lvx. In the strncasecmp build |
| each loop hands over to L(bytebybyte) once fewer than 16 bytes |
| remain in r5. */ |
| L(s1_align): |
| addi rSTR1, rSTR1, 16 |
| addi rSTR2, rSTR2, 16 |
| #ifdef USE_AS_STRNCASECMP |
| cmpdi cr7, r5, 16 |
| blt cr7, L(bytebybyte) /* Tail shorter than one vector. */ |
| addi r5, r5, -16 |
| #endif |
| lvx v4, 0, rSTR1 /* s1 is aligned: one load suffices. */ |
| GET16BYTES(v5, rSTR2, v6) |
| CHECKNULLANDCONVERT() |
| blt cr6, L(s1_align) /* All equal: next 16 bytes. */ |
| b L(different) |
| .align 4 |
| L(aligned): |
| addi rSTR1, rSTR1, 16 |
| addi rSTR2, rSTR2, 16 |
| #ifdef USE_AS_STRNCASECMP |
| cmpdi cr7, r5, 16 |
| blt cr7, L(bytebybyte) /* Tail shorter than one vector. */ |
| addi r5, r5, -16 |
| #endif |
| lvx v4, 0, rSTR1 |
| lvx v5, 0, rSTR2 |
| CHECKNULLANDCONVERT() |
| blt cr6, L(aligned) /* All equal: next 16 bytes; otherwise fall into L(different). */ |
| |
| /* Calculate and return the difference. |
| On entry v4 and v5 hold the converted bytes and v7 the equality mask |
| left by TOLOWER. v1 is overwritten with 0x40 in every byte, which is |
| harmless because this path ends in blr. rSTR1 and rSTR2 no longer |
| hold pointers below; they are scratch for the two extracted bytes. */ |
| L(different): |
| vaddubm v1, v3, v3 /* v1 = 0x20 + 0x20 = 0x40 (64). */ |
| vcmpequb v7, v0, v7 /* Invert the mask: 0xff where the bytes differ. */ |
| #ifdef __LITTLE_ENDIAN__ |
| /* Count trailing zero. */ |
| vspltisb v8, -1 |
| VADDUQM_V7_V8 /* v9 = v7 + v8, that is v7 - 1. */ |
| vandc v8, v9, v7 /* v8 = v9 & ~v7: ones below the lowest set bit of v7. */ |
| VPOPCNTD_V8_V8 /* Count those ones per doubleword. */ |
| vspltb v6, v8, 15 |
| vcmpequb. v6, v6, v1 /* Is that count byte equal to 64? */ |
| blt cr6, L(shift8) /* Yes: add in the other doubleword. */ |
| #else |
| /* Count leading zero. */ |
| VCLZD_V8_v7 /* Leading zeros per doubleword of v7. */ |
| vspltb v6, v8, 7 |
| vcmpequb. v6, v6, v1 /* Is that count byte equal to 64? */ |
| blt cr6, L(shift8) /* Yes: add in the other doubleword. */ |
| vsro v8, v8, v1 /* Shift the count right by 8 bytes. */ |
| #endif |
| b L(skipsum) |
| .align 4 |
| L(shift8): |
| vsumsws v8, v8, v0 /* Sum the words of v8 into the last word. */ |
| L(skipsum): |
| #ifdef __LITTLE_ENDIAN__ |
| /* Shift registers based on the zero count computed above. */ |
| vsro v6, v5, v8 |
| vsro v7, v4, v8 |
| /* Merge and move to GPR. */ |
| vmrglb v6, v6, v7 |
| vslo v1, v6, v1 /* Shift left by 8 bytes (v1 = 0x40). */ |
| MFVRD_R3_V1 |
| /* Place the characters that are different in first position. */ |
| sldi rSTR2, rRTN, 56 |
| srdi rSTR2, rSTR2, 56 /* rSTR2 = least significant byte of r3. */ |
| sldi rSTR1, rRTN, 48 |
| srdi rSTR1, rSTR1, 56 /* rSTR1 = next byte of r3. */ |
| #else |
| vslo v6, v5, v8 |
| vslo v7, v4, v8 |
| vmrghb v1, v6, v7 |
| MFVRD_R3_V1 |
| srdi rSTR2, rRTN, 48 |
| sldi rSTR2, rSTR2, 56 |
| srdi rSTR2, rSTR2, 56 /* rSTR2 = second most significant byte of r3. */ |
| srdi rSTR1, rRTN, 56 /* rSTR1 = most significant byte of r3. */ |
| #endif |
| subf rRTN, rSTR1, rSTR2 /* r3 = rSTR2 - rSTR1. */ |
| extsw rRTN, rRTN |
| blr |
| |
| .align 4 |
| /* OK. We've hit the end of the string. We need to be careful that |
| we don't compare two strings as different because of junk beyond |
| the end of the strings... |
| On entry v7 is the zero-byte mask of v4 left by CHECKNULLANDCONVERT |
| and v4, v5 are still unconverted. The bytes after the null are |
| shifted out of both vectors before TOLOWER compares them. */ |
| L(null_found): |
| vaddubm v10, v3, v3 /* v10 = 0x40 (64); v1 is left alone because TOLOWER below reads it. */ |
| #ifdef __LITTLE_ENDIAN__ |
| /* Count trailing zero. */ |
| vspltisb v8, -1 |
| VADDUQM_V7_V8 /* v9 = v7 + v8, that is v7 - 1. */ |
| vandc v8, v9, v7 /* v8 = v9 & ~v7: ones below the lowest set bit of v7. */ |
| VPOPCNTD_V8_V8 /* Count those ones per doubleword. */ |
| vspltb v6, v8, 15 |
| vcmpequb. v6, v6, v10 /* Is that count byte equal to 64? */ |
| blt cr6, L(shift_8) /* Yes: add in the other doubleword. */ |
| #else |
| /* Count leading zero. */ |
| VCLZD_V8_v7 /* Leading zeros per doubleword of v7. */ |
| vspltb v6, v8, 7 |
| vcmpequb. v6, v6, v10 /* Is that count byte equal to 64? */ |
| blt cr6, L(shift_8) /* Yes: add in the other doubleword. */ |
| vsro v8, v8, v10 /* Shift the count right by 8 bytes. */ |
| #endif |
| b L(skipsum1) |
| .align 4 |
| L(shift_8): |
| vsumsws v8, v8, v0 /* Sum the words of v8 into the last word. */ |
| L(skipsum1): |
| /* Calculate shift count based on count of zero. */ |
| vspltisb v10, 7 |
| vslb v10, v10, v10 /* 7 << 7 truncated to a byte: 0x80 in every byte. */ |
| vsldoi v9, v0, v10, 1 /* v9 = 0x80 (128) in the last byte only. */ |
| VSUBUDM_V9_V8 /* v9 = 128 - count. */ |
| vspltisb v8, 8 |
| vsldoi v8, v0, v8, 1 /* v8 = 8 in the last byte only. */ |
| VSUBUDM_V9_V8 /* v9 = 120 - count. */ |
| /* Shift and remove junk after null character. */ |
| #ifdef __LITTLE_ENDIAN__ |
| vslo v5, v5, v9 |
| vslo v4, v4, v9 |
| #else |
| vsro v5, v5, v9 |
| vsro v4, v4, v9 |
| #endif |
| /* Convert and compare 16 bytes. */ |
| TOLOWER() |
| blt cr6, L(retnull) /* Equal up to the null: return 0. */ |
| b L(different) |
| .align 4 |
| L(retnull): |
| li rRTN, 0 |
| blr |
| .align 4 |
| L(bytebybyte): |
| /* Unrolling loop for POWER: loads are done with 'lbz' plus |
| offset and string descriptors are only updated in the end |
| of loop unrolling. |
| Each byte indexes a table of 4-byte entries at rLOC and the loaded |
| words are compared. rADDR1/rLWR1 share r8 and rADDR2/rLWR2 share |
| r12, so every lwzx replaces its index with the loaded word. |
| In the strncasecmp build CTR counts groups of four bytes (r5 >> 2) |
| for L(loop), and L(loop1) handles the remaining r5 & 3 bytes. |
| NOTE(review): in that build the next byte pair is loaded before the |
| counter is tested, so one byte beyond the counted range of each |
| string is read and then ignored. Confirm this cannot fault at the |
| end of a mapping. */ |
| ld rLOC, LOCALE_CTYPE_TOLOWER(rLOC) /* rLOC = base of the lookup table. */ |
| lbz rCHAR1, 0(rSTR1) /* Load char from s1 */ |
| lbz rCHAR2, 0(rSTR2) /* Load char from s2 */ |
| #ifdef USE_AS_STRNCASECMP |
| rldicl rTMP, r5, 62, 2 /* rTMP = r5 >> 2, the number of 4-byte groups. */ |
| cmpdi cr7, rTMP, 0 |
| beq cr7, L(lessthan4) |
| mtctr rTMP |
| #endif |
| L(loop): |
| cmpdi rCHAR1, 0 /* *s1 == '\0' ? */ |
| sldi rADDR1, rCHAR1, 2 /* Byte offset of tolower(*s1) in the table */ |
| sldi rADDR2, rCHAR2, 2 /* Byte offset of tolower(*s2) in the table */ |
| lwzx rLWR1, rLOC, rADDR1 /* Load tolower(*s1) */ |
| lwzx rLWR2, rLOC, rADDR2 /* Load tolower(*s2) */ |
| cmpw cr1, rLWR1, rLWR2 /* r = tolower(*s1) == tolower(*s2) ? */ |
| crorc 4*cr1+eq,eq,4*cr1+eq /* cr1.eq = (*s1 == '\0') || (r == 0) */ |
| beq cr1, L(done) /* Stop at the end of s1 or on a mismatch. */ |
| lbz rCHAR1, 1(rSTR1) |
| lbz rCHAR2, 1(rSTR2) |
| cmpdi rCHAR1, 0 |
| sldi rADDR1, rCHAR1, 2 |
| sldi rADDR2, rCHAR2, 2 |
| lwzx rLWR1, rLOC, rADDR1 |
| lwzx rLWR2, rLOC, rADDR2 |
| cmpw cr1, rLWR1, rLWR2 |
| crorc 4*cr1+eq,eq,4*cr1+eq |
| beq cr1, L(done) |
| lbz rCHAR1, 2(rSTR1) |
| lbz rCHAR2, 2(rSTR2) |
| cmpdi rCHAR1, 0 |
| sldi rADDR1, rCHAR1, 2 |
| sldi rADDR2, rCHAR2, 2 |
| lwzx rLWR1, rLOC, rADDR1 |
| lwzx rLWR2, rLOC, rADDR2 |
| cmpw cr1, rLWR1, rLWR2 |
| crorc 4*cr1+eq,eq,4*cr1+eq |
| beq cr1, L(done) |
| lbz rCHAR1, 3(rSTR1) |
| lbz rCHAR2, 3(rSTR2) |
| cmpdi rCHAR1, 0 |
| /* Increment both string descriptors */ |
| addi rSTR1, rSTR1, 4 |
| addi rSTR2, rSTR2, 4 |
| sldi rADDR1, rCHAR1, 2 |
| sldi rADDR2, rCHAR2, 2 |
| lwzx rLWR1, rLOC, rADDR1 |
| lwzx rLWR2, rLOC, rADDR2 |
| cmpw cr1, rLWR1, rLWR2 |
| crorc 4*cr1+eq,eq,4*cr1+eq |
| beq cr1, L(done) |
| lbz rCHAR1, 0(rSTR1) /* Load char from s1 */ |
| lbz rCHAR2, 0(rSTR2) /* Load char from s2 */ |
| #ifdef USE_AS_STRNCASECMP |
| bdnz L(loop) /* Falls into L(lessthan4) when CTR reaches zero. */ |
| #else |
| b L(loop) |
| #endif |
| #ifdef USE_AS_STRNCASECMP |
| L(lessthan4): |
| clrldi r5, r5, 62 /* r5 = r5 & 3, the leftover byte count. */ |
| cmpdi cr7, r5, 0 |
| beq cr7, L(retnull) /* Nothing left and no mismatch: return 0. */ |
| mtctr r5 |
| L(loop1): |
| cmpdi rCHAR1, 0 |
| sldi rADDR1, rCHAR1, 2 |
| sldi rADDR2, rCHAR2, 2 |
| lwzx rLWR1, rLOC, rADDR1 |
| lwzx rLWR2, rLOC, rADDR2 |
| cmpw cr1, rLWR1, rLWR2 |
| crorc 4*cr1+eq,eq,4*cr1+eq |
| beq cr1, L(done) |
| addi rSTR1, rSTR1, 1 |
| addi rSTR2, rSTR2, 1 |
| lbz rCHAR1, 0(rSTR1) |
| lbz rCHAR2, 0(rSTR2) |
| bdnz L(loop1) /* On exhaustion falls into L(done) with equal words, giving 0. */ |
| #endif |
| L(done): |
| subf r0, rLWR2, rLWR1 /* r0 = rLWR1 - rLWR2. */ |
| extsw rRTN, r0 |
| blr |
| END (__STRCASECMP) |
| |
| weak_alias (__STRCASECMP, STRCASECMP) |
| libc_hidden_builtin_def (__STRCASECMP) |