| /* Optimized memchr with sse2 without bsf |
| Copyright (C) 2011-2014 Free Software Foundation, Inc. |
| Contributed by Intel Corporation. |
| This file is part of the GNU C Library. |
| |
| The GNU C Library is free software; you can redistribute it and/or |
| modify it under the terms of the GNU Lesser General Public |
| License as published by the Free Software Foundation; either |
| version 2.1 of the License, or (at your option) any later version. |
| |
| The GNU C Library is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| Lesser General Public License for more details. |
| |
| You should have received a copy of the GNU Lesser General Public |
| License along with the GNU C Library; if not, see |
| <http://www.gnu.org/licenses/>. */ |
| |
| #ifndef NOT_IN_libc |
| |
| # include <sysdep.h> |
| |
| /* Unwind annotation to place right after a 4-byte push of REG: grows the |
| CFA offset by 4 and records REG as saved at offset 0. The cfi_* names |
| are not defined in this file (presumably wrappers from <sysdep.h> around |
| the assembler's .cfi_* directives -- confirm there). */ |
| # define CFI_PUSH(REG) \ |
| cfi_adjust_cfa_offset (4); \ |
| cfi_rel_offset (REG, 0) |
| |
| /* Counterpart of CFI_PUSH for use right after a 4-byte pop of REG: shrinks |
| the CFA offset by 4 and marks REG as restored. */ |
| # define CFI_POP(REG) \ |
| cfi_adjust_cfa_offset (-4); \ |
| cfi_restore (REG) |
| |
| /* Push or pop a 32-bit register together with its unwind annotation. */ |
| # define PUSH(REG) pushl REG; CFI_PUSH (REG) |
| # define POP(REG) popl REG; CFI_POP (REG) |
| |
| /* Build configuration. |
| Without USE_AS_RAWMEMCHR the routine takes a length and keeps its scan |
| pointer in %edi, so %edi is pushed on entry (ENTRANCE) and every exit |
| goes through RETURN. PARMS is the %esp offset of the first stack |
| argument: 4 bytes of return address plus the 4 bytes pushed by ENTRANCE. |
| RETURN ends with CFI_PUSH because it is expanded in the middle of the |
| function: the instructions that follow the `ret' in the file still run |
| with %edi pushed, so the unwind state undone by POP is re-declared. |
| With USE_AS_RAWMEMCHR nothing is pushed, PARMS is just the return |
| address, and the code uses a plain `ret' instead of RETURN. */ |
| # ifndef USE_AS_RAWMEMCHR |
| # define ENTRANCE PUSH(%edi); |
| # define PARMS 8 |
| # define RETURN POP(%edi); ret; CFI_PUSH(%edi); |
| # else |
| # define ENTRANCE |
| # define PARMS 4 |
| # endif |
| |
| /* %esp offsets of the stack arguments: STR1 is loaded as the address to |
| scan, STR2 as the byte to search for. */ |
| # define STR1 PARMS |
| # define STR2 STR1+4 |
| |
| /* LEN (the byte count) exists only in the length-limited build. */ |
| # ifndef USE_AS_RAWMEMCHR |
| # define LEN STR2+4 |
| # endif |
| |
| /* Symbol name of the routine; may be overridden before this point. */ |
| # ifndef MEMCHR |
| # define MEMCHR __memchr_sse2 |
| # endif |
| |
| /* Entry point. Stack arguments (read below at STR1, STR2 and LEN): the |
| address to scan, the byte to look for (only its low byte is used), and |
| -- unless USE_AS_RAWMEMCHR is defined -- the number of bytes to scan. |
| Returns in %eax the address of the first matching byte, or 0 through |
| L(return_null). |
| |
| Register roles in the length-limited build: %edi = scan pointer, |
| %edx = bytes remaining, %ecx = scratch / block offset, %eax = compare |
| mask and result, %xmm1 = search byte replicated into all 16 lanes. |
| With USE_AS_RAWMEMCHR there is no length and %edx is the scan pointer. */ |
| atom_text_section |
| ENTRY (MEMCHR) |
| ENTRANCE |
| mov STR1(%esp), %ecx |
| movd STR2(%esp), %xmm1 |
| # ifndef USE_AS_RAWMEMCHR |
| mov LEN(%esp), %edx |
| /* A zero length can never match. */ |
| test %edx, %edx |
| jz L(return_null) |
| # endif |
| |
| /* The two punpcklbw and the pshufd $0 below replicate the low byte of |
| %xmm1 into all 16 byte lanes; they are interleaved with the integer |
| setup. */ |
| punpcklbw %xmm1, %xmm1 |
| # ifndef USE_AS_RAWMEMCHR |
| mov %ecx, %edi |
| # else |
| mov %ecx, %edx |
| # endif |
| punpcklbw %xmm1, %xmm1 |
| |
| /* %ecx = offset of the start address inside its 64-byte block. Above 48 |
| a 16-byte load from the start address would extend into the next |
| 64-byte block, so L(crosscache) uses an aligned load instead. */ |
| and $63, %ecx |
| pshufd $0, %xmm1, %xmm1 |
| cmp $48, %ecx |
| ja L(crosscache) |
| |
| /* Unaligned compare of the first 16 bytes; %eax receives one bit per |
| byte equal to the search byte. */ |
| # ifndef USE_AS_RAWMEMCHR |
| movdqu (%edi), %xmm0 |
| # else |
| movdqu (%edx), %xmm0 |
| # endif |
| pcmpeqb %xmm1, %xmm0 |
| pmovmskb %xmm0, %eax |
| test %eax, %eax |
| # ifndef USE_AS_RAWMEMCHR |
| /* A hit still has to be checked against the length (case 2 path). */ |
| jnz L(match_case2_prolog) |
| |
| /* No hit in the first 16 bytes: done if the length was <= 16. Otherwise |
| %edi = (start + 16) & -16 and %edx = length - 16 + (start & 15), i.e. |
| the bytes left counted from the new, 16-byte aligned %edi. */ |
| sub $16, %edx |
| jbe L(return_null) |
| lea 16(%edi), %edi |
| and $15, %ecx |
| and $-16, %edi |
| add %ecx, %edx |
| # else |
| jnz L(match_case1_prolog) |
| /* No hit: continue from the next 16-byte boundary. */ |
| lea 16(%edx), %edx |
| and $-16, %edx |
| # endif |
| jmp L(loop_prolog) |
| |
| .p2align 4 |
| /* Start address lies in the last 15 bytes of a 64-byte block, so an |
| unaligned 16-byte load could reach into the next block. Instead load |
| the enclosing 16-byte aligned chunk and discard the mask bits of the |
| bytes that precede the start address. |
| In: %ecx = start & 63; %edi (or %edx with USE_AS_RAWMEMCHR) = start; |
| %edx = length (length-limited build only); %xmm1 = search pattern. |
| Falls through into L(loop_prolog) when the chunk holds no match. */ |
| L(crosscache): |
| /* %ecx = number of bytes in the aligned chunk that precede the start. */ |
| and $15, %ecx |
| # ifndef USE_AS_RAWMEMCHR |
| and $-16, %edi |
| movdqa (%edi), %xmm0 |
| # else |
| and $-16, %edx |
| movdqa (%edx), %xmm0 |
| # endif |
| pcmpeqb %xmm1, %xmm0 |
| pmovmskb %xmm0, %eax |
| /* Shift out the bits that belong to bytes before the start address. */ |
| sar %cl, %eax |
| test %eax, %eax |
| |
| # ifndef USE_AS_RAWMEMCHR |
| /* On a hit, L(match_case2_prolog1) adds %ecx back to %edi and checks the |
| match position against the length still held in %edx. */ |
| jnz L(match_case2_prolog1) |
| /* Bytes left after this chunk = length - (16 - %ecx). %ecx is below 16, |
| so 16 - %ecx is in 1..16 and the subtraction is compared unsigned: a |
| length with the top bit set (up to SIZE_MAX) is neither mistaken for a |
| negative count nor wrapped around, unlike "length - 16 + %ecx" tested |
| with a signed branch. %ecx is dead from here on; both L(loop_prolog) |
| and L(exit_loop) clear it before use. */ |
| neg %ecx |
| add $16, %ecx |
| sub %ecx, %edx |
| jbe L(return_null) |
| lea 16(%edi), %edi |
| # else |
| jnz L(match_case1_prolog1) |
| lea 16(%edx), %edx |
| # endif |
| |
| .p2align 4 |
| /* Scan pointer is 16-byte aligned here. Probe up to 128 bytes as two |
| groups of four aligned 16-byte compares, then align the pointer down to |
| 64 bytes and fall through into L(align64_loop). |
| In the length-limited build each group first takes 64 off %edx and |
| leaves through L(exit_loop) when 64 or fewer bytes remained, so a hit |
| found inside a group is always within the length and can use the |
| unchecked L(match_case1) decode. %ecx tracks the offset (0/16/32/48) |
| of the chunk being tested within its group. */ |
| L(loop_prolog): |
| # ifndef USE_AS_RAWMEMCHR |
| sub $64, %edx |
| jbe L(exit_loop) |
| movdqa (%edi), %xmm0 |
| # else |
| movdqa (%edx), %xmm0 |
| # endif |
| pcmpeqb %xmm1, %xmm0 |
| xor %ecx, %ecx |
| pmovmskb %xmm0, %eax |
| test %eax, %eax |
| jnz L(match_case1) |
| |
| # ifndef USE_AS_RAWMEMCHR |
| movdqa 16(%edi), %xmm2 |
| # else |
| movdqa 16(%edx), %xmm2 |
| # endif |
| pcmpeqb %xmm1, %xmm2 |
| lea 16(%ecx), %ecx |
| pmovmskb %xmm2, %eax |
| test %eax, %eax |
| jnz L(match_case1) |
| |
| # ifndef USE_AS_RAWMEMCHR |
| movdqa 32(%edi), %xmm3 |
| # else |
| movdqa 32(%edx), %xmm3 |
| # endif |
| pcmpeqb %xmm1, %xmm3 |
| lea 16(%ecx), %ecx |
| pmovmskb %xmm3, %eax |
| test %eax, %eax |
| jnz L(match_case1) |
| |
| # ifndef USE_AS_RAWMEMCHR |
| movdqa 48(%edi), %xmm4 |
| # else |
| movdqa 48(%edx), %xmm4 |
| # endif |
| pcmpeqb %xmm1, %xmm4 |
| lea 16(%ecx), %ecx |
| pmovmskb %xmm4, %eax |
| test %eax, %eax |
| jnz L(match_case1) |
| |
| /* First group clean: advance 64 bytes and repeat for a second group. */ |
| # ifndef USE_AS_RAWMEMCHR |
| lea 64(%edi), %edi |
| sub $64, %edx |
| jbe L(exit_loop) |
| |
| movdqa (%edi), %xmm0 |
| # else |
| lea 64(%edx), %edx |
| movdqa (%edx), %xmm0 |
| # endif |
| pcmpeqb %xmm1, %xmm0 |
| xor %ecx, %ecx |
| pmovmskb %xmm0, %eax |
| test %eax, %eax |
| jnz L(match_case1) |
| |
| # ifndef USE_AS_RAWMEMCHR |
| movdqa 16(%edi), %xmm2 |
| # else |
| movdqa 16(%edx), %xmm2 |
| # endif |
| pcmpeqb %xmm1, %xmm2 |
| lea 16(%ecx), %ecx |
| pmovmskb %xmm2, %eax |
| test %eax, %eax |
| jnz L(match_case1) |
| |
| # ifndef USE_AS_RAWMEMCHR |
| movdqa 32(%edi), %xmm3 |
| # else |
| movdqa 32(%edx), %xmm3 |
| # endif |
| pcmpeqb %xmm1, %xmm3 |
| lea 16(%ecx), %ecx |
| pmovmskb %xmm3, %eax |
| test %eax, %eax |
| jnz L(match_case1) |
| |
| # ifndef USE_AS_RAWMEMCHR |
| movdqa 48(%edi), %xmm4 |
| # else |
| movdqa 48(%edx), %xmm4 |
| # endif |
| pcmpeqb %xmm1, %xmm4 |
| lea 16(%ecx), %ecx |
| pmovmskb %xmm4, %eax |
| test %eax, %eax |
| jnz L(match_case1) |
| |
| /* Second group clean: step past it and round the pointer down to a |
| 64-byte boundary for the main loop. In the length-limited build the |
| bytes stepped back over (pointer & 63) are added to %edx so the count |
| keeps referring to the new %edi. */ |
| # ifndef USE_AS_RAWMEMCHR |
| lea 64(%edi), %edi |
| mov %edi, %ecx |
| and $-64, %edi |
| and $63, %ecx |
| add %ecx, %edx |
| # else |
| lea 64(%edx), %edx |
| and $-64, %edx |
| # endif |
| |
| .p2align 4 |
| /* Main loop: one 64-byte aligned group per iteration. The four 16-byte |
| compare results are merged with pmaxub so a single pmovmskb/test decides |
| whether the whole group is clean. In the length-limited build the loop |
| leaves through L(exit_loop) once 64 or fewer bytes remain. */ |
| L(align64_loop): |
| |
| # ifndef USE_AS_RAWMEMCHR |
| sub $64, %edx |
| jbe L(exit_loop) |
| movdqa (%edi), %xmm0 |
| movdqa 16(%edi), %xmm2 |
| movdqa 32(%edi), %xmm3 |
| movdqa 48(%edi), %xmm4 |
| # else |
| movdqa (%edx), %xmm0 |
| movdqa 16(%edx), %xmm2 |
| movdqa 32(%edx), %xmm3 |
| movdqa 48(%edx), %xmm4 |
| # endif |
| pcmpeqb %xmm1, %xmm0 |
| pcmpeqb %xmm1, %xmm2 |
| pcmpeqb %xmm1, %xmm3 |
| pcmpeqb %xmm1, %xmm4 |
| |
| /* %xmm4 = byte-wise maximum of all four results: nonzero in a lane when |
| any chunk matched there. %xmm3 and %xmm4 lose their own results. */ |
| pmaxub %xmm0, %xmm3 |
| pmaxub %xmm2, %xmm4 |
| pmaxub %xmm3, %xmm4 |
| # ifndef USE_AS_RAWMEMCHR |
| add $64, %edi |
| # else |
| add $64, %edx |
| # endif |
| pmovmskb %xmm4, %eax |
| |
| test %eax, %eax |
| jz L(align64_loop) |
| |
| /* A match is somewhere in the group just scanned: undo the pointer |
| advance and find the first matching chunk, with %ecx = its offset. */ |
| # ifndef USE_AS_RAWMEMCHR |
| sub $64, %edi |
| # else |
| sub $64, %edx |
| # endif |
| |
| pmovmskb %xmm0, %eax |
| xor %ecx, %ecx |
| test %eax, %eax |
| jnz L(match_case1) |
| |
| pmovmskb %xmm2, %eax |
| lea 16(%ecx), %ecx |
| test %eax, %eax |
| jnz L(match_case1) |
| |
| /* %xmm3 was overwritten by pmaxub above, so the third chunk is compared |
| again from memory. */ |
| # ifndef USE_AS_RAWMEMCHR |
| movdqa 32(%edi), %xmm3 |
| # else |
| movdqa 32(%edx), %xmm3 |
| # endif |
| pcmpeqb %xmm1, %xmm3 |
| pmovmskb %xmm3, %eax |
| lea 16(%ecx), %ecx |
| test %eax, %eax |
| jnz L(match_case1) |
| |
| /* The first three chunks are clean, so the match is in the fourth. The |
| compare writes into %xmm1, destroying the search pattern; every path |
| from here ends in a return. Falls through into L(match_case1). */ |
| # ifndef USE_AS_RAWMEMCHR |
| pcmpeqb 48(%edi), %xmm1 |
| # else |
| pcmpeqb 48(%edx), %xmm1 |
| # endif |
| pmovmskb %xmm1, %eax |
| lea 16(%ecx), %ecx |
| |
| .p2align 4 |
| /* "Case 1" decode: a match whose position needs no length check. |
| In: %eax = nonzero 16-bit compare mask, bit i set when byte i of the |
| chunk matched; %ecx = offset of the chunk from the scan pointer. |
| The scan pointer is advanced by %ecx, then the lowest set mask bit is |
| located with a tree of tests (no bsf): low byte vs. high byte, low |
| nibble vs. high nibble, then individual bits. Bits 0-2 of a nibble |
| branch to the L(ExitCase1_N) stub that returns pointer + N - 1; the |
| fourth bit is the only one left and is returned inline. |
| With USE_AS_RAWMEMCHR two extra entry points exist: |
| L(match_case1_prolog1) (pointer still needs + %ecx) and |
| L(match_case1_prolog) (pointer already final). */ |
| L(match_case1): |
| # ifndef USE_AS_RAWMEMCHR |
| add %ecx, %edi |
| # else |
| L(match_case1_prolog1): |
| add %ecx, %edx |
| L(match_case1_prolog): |
| # endif |
| /* Any match among bytes 0-7? */ |
| test %al, %al |
| jz L(match_case1_high) |
| /* %cl = mask bits 0-3 (%ecx is free again after the add above). */ |
| mov %al, %cl |
| and $15, %cl |
| jz L(match_case1_8) |
| test $0x01, %al |
| jnz L(ExitCase1_1) |
| test $0x02, %al |
| jnz L(ExitCase1_2) |
| test $0x04, %al |
| jnz L(ExitCase1_3) |
| /* Only bit 3 is left: match at byte 3. */ |
| # ifndef USE_AS_RAWMEMCHR |
| lea 3(%edi), %eax |
| RETURN |
| # else |
| lea 3(%edx), %eax |
| ret |
| # endif |
| |
| .p2align 4 |
| /* Bits 0-3 clear, low byte nonzero: first match is at byte 4..7. */ |
| L(match_case1_8): |
| test $0x10, %al |
| jnz L(ExitCase1_5) |
| test $0x20, %al |
| jnz L(ExitCase1_6) |
| test $0x40, %al |
| jnz L(ExitCase1_7) |
| # ifndef USE_AS_RAWMEMCHR |
| lea 7(%edi), %eax |
| RETURN |
| # else |
| lea 7(%edx), %eax |
| ret |
| # endif |
| |
| .p2align 4 |
| /* Low byte clear: first match is at byte 8..15, decoded from %ah. */ |
| L(match_case1_high): |
| mov %ah, %ch |
| and $15, %ch |
| jz L(match_case1_high_8) |
| test $0x01, %ah |
| jnz L(ExitCase1_9) |
| test $0x02, %ah |
| jnz L(ExitCase1_10) |
| test $0x04, %ah |
| jnz L(ExitCase1_11) |
| # ifndef USE_AS_RAWMEMCHR |
| lea 11(%edi), %eax |
| RETURN |
| # else |
| lea 11(%edx), %eax |
| ret |
| # endif |
| |
| .p2align 4 |
| /* Bits 8-11 clear as well: first match is at byte 12..15. */ |
| L(match_case1_high_8): |
| test $0x10, %ah |
| jnz L(ExitCase1_13) |
| test $0x20, %ah |
| jnz L(ExitCase1_14) |
| test $0x40, %ah |
| jnz L(ExitCase1_15) |
| # ifndef USE_AS_RAWMEMCHR |
| lea 15(%edi), %eax |
| RETURN |
| # else |
| lea 15(%edx), %eax |
| ret |
| # endif |
| |
| # ifndef USE_AS_RAWMEMCHR |
| .p2align 4 |
| /* Tail of the length-limited scan, reached when a `sub $64, %edx' found |
| 64 or fewer bytes left. The subtraction is undone, then up to four |
| 16-byte chunks are tested one by one. After each clean chunk the scan |
| stops with a null result if the remaining count did not reach past it. |
| Hits go to L(match_case2), which checks the exact match position |
| against the count; %ecx carries the chunk offset (0/16/32/48). */ |
| L(exit_loop): |
| /* Restore the remaining byte count. */ |
| add $64, %edx |
| |
| movdqa (%edi), %xmm0 |
| pcmpeqb %xmm1, %xmm0 |
| xor %ecx, %ecx |
| pmovmskb %xmm0, %eax |
| test %eax, %eax |
| jnz L(match_case2) |
| cmp $16, %edx |
| jbe L(return_null) |
| |
| movdqa 16(%edi), %xmm2 |
| pcmpeqb %xmm1, %xmm2 |
| lea 16(%ecx), %ecx |
| pmovmskb %xmm2, %eax |
| test %eax, %eax |
| jnz L(match_case2) |
| cmp $32, %edx |
| jbe L(return_null) |
| |
| movdqa 32(%edi), %xmm3 |
| pcmpeqb %xmm1, %xmm3 |
| lea 16(%ecx), %ecx |
| pmovmskb %xmm3, %eax |
| test %eax, %eax |
| jnz L(match_case2) |
| cmp $48, %edx |
| jbe L(return_null) |
| |
| /* Last chunk: the compare result overwrites the search pattern in %xmm1, |
| which is not needed any more. */ |
| pcmpeqb 48(%edi), %xmm1 |
| lea 16(%ecx), %ecx |
| pmovmskb %xmm1, %eax |
| test %eax, %eax |
| jnz L(match_case2) |
| |
| /* Nothing found in the remaining bytes. */ |
| xor %eax, %eax |
| RETURN |
| # endif |
| |
| /* Return stubs for the unchecked ("case 1") match decode. |
| L(ExitCase1_N) returns the scan pointer plus N - 1, i.e. the address of |
| byte N - 1 of the matching 16-byte chunk. N = 4, 8, 12 and 16 have no |
| stub because L(match_case1) returns those inline. |
| The two builds differ only in the pointer register and in how they |
| return, so each build gets its own complete list of stubs. */ |
| # ifndef USE_AS_RAWMEMCHR |
| |
| /* Length-limited build: pointer in %edi, return through RETURN. */ |
| .p2align 4 |
| L(ExitCase1_1): |
| mov %edi, %eax |
| RETURN |
| |
| .p2align 4 |
| L(ExitCase1_2): |
| lea 1(%edi), %eax |
| RETURN |
| |
| .p2align 4 |
| L(ExitCase1_3): |
| lea 2(%edi), %eax |
| RETURN |
| |
| .p2align 4 |
| L(ExitCase1_5): |
| lea 4(%edi), %eax |
| RETURN |
| |
| .p2align 4 |
| L(ExitCase1_6): |
| lea 5(%edi), %eax |
| RETURN |
| |
| .p2align 4 |
| L(ExitCase1_7): |
| lea 6(%edi), %eax |
| RETURN |
| |
| .p2align 4 |
| L(ExitCase1_9): |
| lea 8(%edi), %eax |
| RETURN |
| |
| .p2align 4 |
| L(ExitCase1_10): |
| lea 9(%edi), %eax |
| RETURN |
| |
| .p2align 4 |
| L(ExitCase1_11): |
| lea 10(%edi), %eax |
| RETURN |
| |
| .p2align 4 |
| L(ExitCase1_13): |
| lea 12(%edi), %eax |
| RETURN |
| |
| .p2align 4 |
| L(ExitCase1_14): |
| lea 13(%edi), %eax |
| RETURN |
| |
| .p2align 4 |
| L(ExitCase1_15): |
| lea 14(%edi), %eax |
| RETURN |
| |
| # else |
| |
| /* USE_AS_RAWMEMCHR build: pointer in %edx, plain `ret'. */ |
| .p2align 4 |
| L(ExitCase1_1): |
| mov %edx, %eax |
| ret |
| |
| .p2align 4 |
| L(ExitCase1_2): |
| lea 1(%edx), %eax |
| ret |
| |
| .p2align 4 |
| L(ExitCase1_3): |
| lea 2(%edx), %eax |
| ret |
| |
| .p2align 4 |
| L(ExitCase1_5): |
| lea 4(%edx), %eax |
| ret |
| |
| .p2align 4 |
| L(ExitCase1_6): |
| lea 5(%edx), %eax |
| ret |
| |
| .p2align 4 |
| L(ExitCase1_7): |
| lea 6(%edx), %eax |
| ret |
| |
| .p2align 4 |
| L(ExitCase1_9): |
| lea 8(%edx), %eax |
| ret |
| |
| .p2align 4 |
| L(ExitCase1_10): |
| lea 9(%edx), %eax |
| ret |
| |
| .p2align 4 |
| L(ExitCase1_11): |
| lea 10(%edx), %eax |
| ret |
| |
| .p2align 4 |
| L(ExitCase1_13): |
| lea 12(%edx), %eax |
| ret |
| |
| .p2align 4 |
| L(ExitCase1_14): |
| lea 13(%edx), %eax |
| ret |
| |
| .p2align 4 |
| L(ExitCase1_15): |
| lea 14(%edx), %eax |
| ret |
| |
| # endif |
| |
| # ifndef USE_AS_RAWMEMCHR |
| .p2align 4 |
| /* "Case 2" decode (length-limited build only): like L(match_case1), but the |
| match may lie beyond the requested length, so its position is checked |
| against the remaining count before it is returned. |
| Entry points: |
| L(match_case2) %edx counts from %edi; chunk at %edi + %ecx. |
| L(match_case2_prolog1) %edx already counts from %edi + %ecx. |
| L(match_case2_prolog) %edi already final, %edx counts from it. |
| In all cases %eax holds the nonzero 16-bit compare mask. |
| A match at byte index i is accepted only when %edx >= i + 1: each exit |
| subtracts i + 1 and returns null on borrow. */ |
| L(match_case2): |
| sub %ecx, %edx |
| L(match_case2_prolog1): |
| add %ecx, %edi |
| L(match_case2_prolog): |
| /* Any match among bytes 0-7? */ |
| test %al, %al |
| jz L(match_case2_high) |
| mov %al, %cl |
| and $15, %cl |
| jz L(match_case2_8) |
| test $0x01, %al |
| jnz L(ExitCase2_1) |
| test $0x02, %al |
| jnz L(ExitCase2_2) |
| test $0x04, %al |
| jnz L(ExitCase2_3) |
| /* Only bit 3 is left: match at byte 3, valid if at least 4 bytes remain. */ |
| sub $4, %edx |
| jb L(return_null) |
| lea 3(%edi), %eax |
| RETURN |
| |
| .p2align 4 |
| /* Bits 0-3 clear, low byte nonzero: first match is at byte 4..7. */ |
| L(match_case2_8): |
| test $0x10, %al |
| jnz L(ExitCase2_5) |
| test $0x20, %al |
| jnz L(ExitCase2_6) |
| test $0x40, %al |
| jnz L(ExitCase2_7) |
| sub $8, %edx |
| jb L(return_null) |
| lea 7(%edi), %eax |
| RETURN |
| |
| .p2align 4 |
| /* Low byte clear: first match is at byte 8..15, decoded from %ah. */ |
| L(match_case2_high): |
| mov %ah, %ch |
| and $15, %ch |
| jz L(match_case2_high_8) |
| test $0x01, %ah |
| jnz L(ExitCase2_9) |
| test $0x02, %ah |
| jnz L(ExitCase2_10) |
| test $0x04, %ah |
| jnz L(ExitCase2_11) |
| sub $12, %edx |
| jb L(return_null) |
| lea 11(%edi), %eax |
| RETURN |
| |
| .p2align 4 |
| /* Bits 8-11 clear as well: first match is at byte 12..15. */ |
| L(match_case2_high_8): |
| test $0x10, %ah |
| jnz L(ExitCase2_13) |
| test $0x20, %ah |
| jnz L(ExitCase2_14) |
| test $0x40, %ah |
| jnz L(ExitCase2_15) |
| sub $16, %edx |
| jb L(return_null) |
| lea 15(%edi), %eax |
| RETURN |
| |
| .p2align 4 |
| /* Match at byte 0: returned without a length check (presumably because |
| every path into this decode has at least one byte remaining -- confirm |
| against the callers). */ |
| L(ExitCase2_1): |
| mov %edi, %eax |
| RETURN |
| |
| .p2align 4 |
| /* L(ExitCase2_N), N >= 2: match at byte N - 1; null unless %edx >= N. */ |
| L(ExitCase2_2): |
| sub $2, %edx |
| jb L(return_null) |
| lea 1(%edi), %eax |
| RETURN |
| |
| .p2align 4 |
| L(ExitCase2_3): |
| sub $3, %edx |
| jb L(return_null) |
| lea 2(%edi), %eax |
| RETURN |
| |
| .p2align 4 |
| L(ExitCase2_5): |
| sub $5, %edx |
| jb L(return_null) |
| lea 4(%edi), %eax |
| RETURN |
| |
| .p2align 4 |
| L(ExitCase2_6): |
| sub $6, %edx |
| jb L(return_null) |
| lea 5(%edi), %eax |
| RETURN |
| |
| .p2align 4 |
| L(ExitCase2_7): |
| sub $7, %edx |
| jb L(return_null) |
| lea 6(%edi), %eax |
| RETURN |
| |
| .p2align 4 |
| L(ExitCase2_9): |
| sub $9, %edx |
| jb L(return_null) |
| lea 8(%edi), %eax |
| RETURN |
| |
| .p2align 4 |
| L(ExitCase2_10): |
| sub $10, %edx |
| jb L(return_null) |
| lea 9(%edi), %eax |
| RETURN |
| |
| .p2align 4 |
| L(ExitCase2_11): |
| sub $11, %edx |
| jb L(return_null) |
| lea 10(%edi), %eax |
| RETURN |
| |
| .p2align 4 |
| L(ExitCase2_13): |
| sub $13, %edx |
| jb L(return_null) |
| lea 12(%edi), %eax |
| RETURN |
| |
| .p2align 4 |
| L(ExitCase2_14): |
| sub $14, %edx |
| jb L(return_null) |
| lea 13(%edi), %eax |
| RETURN |
| |
| .p2align 4 |
| L(ExitCase2_15): |
| sub $15, %edx |
| jb L(return_null) |
| lea 14(%edi), %eax |
| RETURN |
| # endif |
| |
| .p2align 4 |
| /* Common "not found" exit: return 0 in %eax. The length-limited build |
| goes through RETURN so the %edi pushed on entry is popped first. */ |
| L(return_null): |
| xor %eax, %eax |
| # ifndef USE_AS_RAWMEMCHR |
| RETURN |
| # else |
| ret |
| # endif |
| |
| END (MEMCHR) |
| #endif |