| /* strrchr/wcsrchr optimized with AVX2. |
| Copyright (C) 2017-2018 Free Software Foundation, Inc. |
| This file is part of the GNU C Library. |
| |
| The GNU C Library is free software; you can redistribute it and/or |
| modify it under the terms of the GNU Lesser General Public |
| License as published by the Free Software Foundation; either |
| version 2.1 of the License, or (at your option) any later version. |
| |
| The GNU C Library is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| Lesser General Public License for more details. |
| |
| You should have received a copy of the GNU Lesser General Public |
| License along with the GNU C Library; if not, see |
| <http://www.gnu.org/licenses/>. */ |
| |
| #if IS_IN (libc) |
| |
| # include <sysdep.h> |
| |
| # ifndef STRRCHR /* Default symbol name; a definition of STRRCHR made before this point takes precedence. */ |
| # define STRRCHR __strrchr_avx2 |
| # endif |
| /* Pick the dword-element instruction forms for wcsrchr and the byte-element forms otherwise. */ |
| # ifdef USE_AS_WCSRCHR |
| # define VPBROADCAST vpbroadcastd |
| # define VPCMPEQ vpcmpeqd |
| # else |
| # define VPBROADCAST vpbroadcastb |
| # define VPCMPEQ vpcmpeqb |
| # endif |
| /* VZEROUPPER can be predefined to something else; by default it is the plain instruction. */ |
| # ifndef VZEROUPPER |
| # define VZEROUPPER vzeroupper |
| # endif |
| /* Size in bytes of one YMM register; the unit of every vector load and pointer step in this file. */ |
| # define VEC_SIZE 32 |
| /* Emit the code into the allocatable, executable section .text.avx. */ |
| .section .text.avx,"ax",@progbits |
| ENTRY (STRRCHR) /* In: %rdi = string, %esi = CHAR. Out: %rax = address of the last CHAR at or before the terminating nul CHAR, or 0 if there is none. */ |
| movd %esi, %xmm4 |
| movl %edi, %ecx /* Low address bits; only used for the alignment checks below. */ |
| /* Broadcast CHAR to YMM4. */ |
| VPBROADCAST %xmm4, %ymm4 |
| vpxor %ymm0, %ymm0, %ymm0 /* YMM0 = all zeros, compared against to locate the nul CHAR. */ |
| /* An unaligned VEC_SIZE load that starts more than VEC_SIZE bytes into a 2 * VEC_SIZE aligned block would run past the end of that block. */ |
| /* Check if we may cross page boundary with one vector load. */ |
| andl $(2 * VEC_SIZE - 1), %ecx |
| cmpl $VEC_SIZE, %ecx |
| ja L(cros_page_boundary) |
| /* Scan the first VEC_SIZE bytes with an unaligned load: %ecx = nul CHAR mask, %eax = CHAR mask. */ |
| vmovdqu (%rdi), %ymm1 |
| VPCMPEQ %ymm1, %ymm0, %ymm2 |
| VPCMPEQ %ymm1, %ymm4, %ymm3 |
| vpmovmskb %ymm2, %ecx |
| vpmovmskb %ymm3, %eax |
| addq $VEC_SIZE, %rdi |
| |
| testl %eax, %eax |
| jnz L(first_vec) |
| /* No CHAR in this vector, so a nul CHAR in it means the string contains no CHAR at all. */ |
| testl %ecx, %ecx |
| jnz L(return_null) |
| /* Neither was found: round %rdi down to a VEC_SIZE boundary (this may re-scan bytes already checked) and clear the remembered-match mask %edx. */ |
| andq $-VEC_SIZE, %rdi |
| xorl %edx, %edx |
| jmp L(aligned_loop) |
| |
| .p2align 4 |
| L(first_vec): |
| /* Check if there is a nul CHAR. */ |
| testl %ecx, %ecx |
| jnz L(char_and_nul_in_first_vec) |
| /* %edx = CHAR mask of the vector, %rsi = address VEC_SIZE past the byte that bit 0 of %edx stands for. */ |
| /* Remember the match and keep searching. */ |
| movl %eax, %edx |
| movq %rdi, %rsi |
| andq $-VEC_SIZE, %rdi |
| jmp L(aligned_loop) |
| |
| .p2align 4 |
| L(cros_page_boundary): /* Use an aligned load of the vector that contains the string start; %ecx is reduced to the start offset inside that vector. */ |
| andl $(VEC_SIZE - 1), %ecx |
| andq $-VEC_SIZE, %rdi |
| vmovdqa (%rdi), %ymm1 |
| VPCMPEQ %ymm1, %ymm0, %ymm2 |
| VPCMPEQ %ymm1, %ymm4, %ymm3 |
| vpmovmskb %ymm2, %edx |
| vpmovmskb %ymm3, %eax |
| shrl %cl, %edx /* Drop the mask bits of bytes that precede the string start; bit 0 now stands for the first string byte. */ |
| shrl %cl, %eax |
| addq $VEC_SIZE, %rdi |
| |
| /* Check if there is a CHAR. */ |
| testl %eax, %eax |
| jnz L(found_char) |
| |
| testl %edx, %edx |
| jnz L(return_null) |
| /* %edx (the nul CHAR mask) is zero here, so it already reads as "no remembered match" for the loop. */ |
| jmp L(aligned_loop) |
| |
| .p2align 4 |
| L(found_char): |
| testl %edx, %edx |
| jnz L(char_and_nul) |
| |
| /* Remember the match and keep searching. */ |
| movl %eax, %edx |
| leaq (%rdi, %rcx), %rsi /* The mask was shifted right by %rcx, so the saved address is biased by the same amount. */ |
| |
| .p2align 4 |
| L(aligned_loop): /* Unrolled four times: each step scans one aligned vector and leaves %rdi just past it. */ |
| vmovdqa (%rdi), %ymm1 |
| VPCMPEQ %ymm1, %ymm0, %ymm2 |
| addq $VEC_SIZE, %rdi |
| VPCMPEQ %ymm1, %ymm4, %ymm3 |
| vpmovmskb %ymm2, %ecx |
| vpmovmskb %ymm3, %eax |
| orl %eax, %ecx /* Nonzero if the vector holds a CHAR or a nul CHAR; %eax still holds the CHAR mask alone. */ |
| jnz L(char_nor_null) |
| |
| vmovdqa (%rdi), %ymm1 |
| VPCMPEQ %ymm1, %ymm0, %ymm2 |
| add $VEC_SIZE, %rdi |
| VPCMPEQ %ymm1, %ymm4, %ymm3 |
| vpmovmskb %ymm2, %ecx |
| vpmovmskb %ymm3, %eax |
| orl %eax, %ecx |
| jnz L(char_nor_null) |
| |
| vmovdqa (%rdi), %ymm1 |
| VPCMPEQ %ymm1, %ymm0, %ymm2 |
| addq $VEC_SIZE, %rdi |
| VPCMPEQ %ymm1, %ymm4, %ymm3 |
| vpmovmskb %ymm2, %ecx |
| vpmovmskb %ymm3, %eax |
| orl %eax, %ecx |
| jnz L(char_nor_null) |
| |
| vmovdqa (%rdi), %ymm1 |
| VPCMPEQ %ymm1, %ymm0, %ymm2 |
| addq $VEC_SIZE, %rdi |
| VPCMPEQ %ymm1, %ymm4, %ymm3 |
| vpmovmskb %ymm2, %ecx |
| vpmovmskb %ymm3, %eax |
| orl %eax, %ecx |
| jz L(aligned_loop) |
| |
| .p2align 4 |
| L(char_nor_null): |
| /* The loop found a CHAR or a nul CHAR. %eax is the CHAR mask; if it is zero, only a nul CHAR was found. */ |
| testl %eax, %eax |
| jnz L(match) |
| L(return_value): /* Return the remembered match described by %edx and %rsi, or null if %edx is zero. */ |
| testl %edx, %edx |
| jz L(return_null) |
| movl %edx, %eax |
| movq %rsi, %rdi |
| |
| # ifdef USE_AS_WCSRCHR |
| /* Keep the first bit for each matching CHAR for bsr. */ |
| andl $0x11111111, %eax |
| # endif |
| bsrl %eax, %eax /* %eax = index of the highest set bit, i.e. the byte offset of the last match in the vector. */ |
| leaq -VEC_SIZE(%rdi, %rax), %rax /* %rdi is VEC_SIZE past the byte that mask bit 0 stands for. */ |
| VZEROUPPER |
| ret |
| |
| .p2align 4 |
| L(match): |
| /* Find a CHAR. Check if there is a nul CHAR. */ |
| vpmovmskb %ymm2, %ecx /* Recompute the nul CHAR mask: %ecx was merged with the CHAR mask inside the loop. */ |
| testl %ecx, %ecx |
| jnz L(find_nul) |
| |
| /* Remember the match and keep searching. */ |
| movl %eax, %edx |
| movq %rdi, %rsi |
| jmp L(aligned_loop) |
| |
| .p2align 4 |
| L(find_nul): |
| # ifdef USE_AS_WCSRCHR |
| /* Keep the first bit for each matching CHAR for bsr. */ |
| andl $0x11111111, %ecx |
| andl $0x11111111, %eax |
| # endif |
| /* Mask out any matching bits after the nul CHAR. */ |
| movl %ecx, %r8d |
| subl $1, %r8d |
| xorl %ecx, %r8d /* %r8d = %ecx ^ (%ecx - 1): every bit up to and including the lowest set bit of %ecx (the first nul CHAR). */ |
| andl %r8d, %eax |
| testl %eax, %eax |
| /* If there is no CHAR here, return the remembered one. */ |
| jz L(return_value) |
| bsrl %eax, %eax |
| leaq -VEC_SIZE(%rdi, %rax), %rax |
| VZEROUPPER |
| ret |
| |
| .p2align 4 |
| L(char_and_nul): |
| /* Find both a CHAR and a nul CHAR. */ |
| addq %rcx, %rdi /* The masks were shifted right by %rcx; bias the base address by the same amount. */ |
| movl %edx, %ecx /* Move the nul CHAR mask into %ecx, where the shared code below expects it. */ |
| L(char_and_nul_in_first_vec): |
| # ifdef USE_AS_WCSRCHR |
| /* Keep the first bit for each matching CHAR for bsr. */ |
| andl $0x11111111, %ecx |
| andl $0x11111111, %eax |
| # endif |
| /* Mask out any matching bits after the nul CHAR. */ |
| movl %ecx, %r8d |
| subl $1, %r8d |
| xorl %ecx, %r8d /* Same %ecx ^ (%ecx - 1) mask as in L(find_nul): bits up to and including the first nul CHAR. */ |
| andl %r8d, %eax |
| testl %eax, %eax |
| /* Return null pointer if the nul CHAR comes first. */ |
| jz L(return_null) |
| bsrl %eax, %eax |
| leaq -VEC_SIZE(%rdi, %rax), %rax |
| VZEROUPPER |
| ret |
| |
| .p2align 4 |
| L(return_null): |
| xorl %eax, %eax /* Writing %eax also clears the upper half of %rax, giving a null pointer. */ |
| VZEROUPPER |
| ret |
| |
| END (STRRCHR) |
| #endif |