| /* Copy SIZE bytes from SRC to DEST. For SUN4V M7. |
| Copyright (C) 2017-2018 Free Software Foundation, Inc. |
| This file is part of the GNU C Library. |
| |
| The GNU C Library is free software; you can redistribute it and/or |
| modify it under the terms of the GNU Lesser General Public |
| License as published by the Free Software Foundation; either |
| version 2.1 of the License, or (at your option) any later version. |
| |
| The GNU C Library is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| Lesser General Public License for more details. |
| |
| You should have received a copy of the GNU Lesser General Public |
| License along with the GNU C Library; if not, see |
| <http://www.gnu.org/licenses/>. */ |
| |
| #include <sysdep.h> |
| |
| #ifndef XCC |
| # define XCC xcc |
| #endif |
| .register %g2,#scratch |
| .register %g3,#scratch |
| .register %g6,#scratch |
| |
| #define FPRS_FEF 0x04 |
| |
| /* |
| * ASI_STBI_P marks the cache line as "least recently used" |
| * which means if many threads are active, it has a high chance |
| * of being pushed out of the cache between the first initializing |
| * store and the final stores. |
| * Thus, in this algorithm we use ASI_STBIMRU_P which marks the |
| * cache line as "most recently used" for all but the last cache |
| * line. |
| */ |
| |
| #define ASI_BLK_INIT_QUAD_LDD_P 0xe2 |
| #define ASI_ST_BLK_INIT_MRU_P 0xf2 |
| |
| #define ASI_STBI_P ASI_BLK_INIT_QUAD_LDD_P |
| #define ASI_STBIMRU_P ASI_ST_BLK_INIT_MRU_P |
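
/* As a C-level illustration of the resulting store pattern per 64-byte
 * cache line (a sketch only; stbimru_store and stbi_store are hypothetical
 * stand-ins for stxa with the two ASIs above, not real functions):
 *
 *   #include <stdint.h>
 *   void stbimru_store (uint64_t *p, uint64_t v);  // stxa ... ASI_STBIMRU_P
 *   void stbi_store (uint64_t *p, uint64_t v);     // stxa ... ASI_STBI_P
 *
 *   static void copy_line (uint64_t *dst, const uint64_t *src)
 *   {
 *     stbimru_store (&dst[0], src[0]);  // initializing store keeps line MRU
 *     for (int i = 1; i < 7; i++)       // middle stores can be ordinary
 *       dst[i] = src[i];
 *     stbi_store (&dst[7], src[7]);     // final store lets the line age out
 *   }
 */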
| |
| #define BLOCK_SIZE 64 /* L2 data cache line size */ |
| #define SHORTCOPY 3 |
| #define SHORTCHECK 14 |
| #define SHORT_LONG 64 /* max copy for short longword-aligned case */ |
| /* must be at least 64 */ |
| #define SMALL_MAX 255 /* max small copy for word/long aligned */ |
| #define SMALL_UMAX 128 /* max small copy for unaligned case */ |
| #define MED_WMAX 1023 /* max copy for medium word-aligned case */ |
| #define MED_MAX 511 /* max copy for medium longword-aligned case */ |
#define ST_CHUNK 20 /* ST_CHUNK - cache lines per batch of BIS stores */
/* On the T4, prefetch 20 is a strong read prefetch into the L1 and L2 data
 * caches; it can delay the instruction pipeline if the data is still in
 * memory.  Prefetch 21 is a strong read prefetch into the L2 data cache
 * only, not the L1 data cache.  */
| #define ALIGN_PRE 20 /* distance for aligned prefetch loop */ |
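
/* As a rough C-level analogue (an assumption on our part, since GCC's
 * builtin does not map one-to-one onto the SPARC prefetch variants):
 *
 *   __builtin_prefetch (p, 0, 3);   // ~ variant 20: read, keep in L1 and L2
 *   __builtin_prefetch (p, 0, 2);   // ~ variant 21: read, keep in L2 only
 */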
| |
| #define EX_ST(x) x |
| #define EX_RETVAL(x) x |
| #define STORE_ASI(src,addr) stxa src, [addr] ASI_STBIMRU_P |
| #define STORE_INIT(src,addr) stxa src, [addr] ASI_STBI_P |
| |
| #if IS_IN (libc) |
| |
| .text |
| |
| ENTRY(__memmove_niagara7) |
| /* %o0=dst, %o1=src, %o2=len */ |
cmp %o1, %o0 /* if from address is >= to, use forward copy */
bgeu,pn %XCC, .Lforcpy /* else check whether the regions overlap */
sub %o0, %o1, %o4 /* get difference of the two addresses */
cmp %o2, %o4 /* compare size with the address difference */
bleu,pn %XCC, .Lforcpy /* if size fits in the gap, copy forward */
add %o1, %o2, %o5 /* get to end of source space */
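
/* In C terms, the dispatch just performed is (a sketch):
 *
 *   if (src >= dst || (size_t) (dst - src) >= len)
 *     goto forward_copy;   // .Lforcpy: no harmful overlap
 *   // else dst overlaps the tail of src: fall through and copy backwards
 */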
| |
| /* an overlapped copy that must be done "backwards" */ |
| .Lchksize: |
cmp %o2, 8 /* if fewer than 8 bytes, do byte copy */
blu,pn %XCC, 2f /* branch to the byte-copy loop */

/* Now size is at least 8 */
| .Ldbalign: |
| add %o0, %o2, %g1 /* get to end of dest space */ |
andcc %g1, 7, %o3 /* %o3 has count till dst is 8-byte aligned */
bz,a,pn %XCC, .Ldbbck /* skip if dst is 8 byte aligned */
andn %o2, 7, %o3 /* force %o3 cnt to multiple of 8 */
sub %o2, %o3, %o2 /* update %o2 with new count */
| |
| 1: dec %o5 /* decrement source */ |
| ldub [%o5], %g1 /* load one byte */ |
| deccc %o3 /* decrement count */ |
| bgu,pt %XCC, 1b /* if not done keep copying */ |
| stb %g1, [%o5+%o4] /* store one byte into dest */ |
| andncc %o2, 7, %o3 /* force %o3 cnt to multiple of 8 */ |
| bz,pn %XCC, 2f /* if size < 8, move to byte copy */ |
| |
| /* Now Destination is 8 byte aligned */ |
| .Ldbbck: |
| andcc %o5, 7, %o0 /* %o0 has src offset */ |
| bz,a,pn %XCC, .Ldbcopybc /* if src is aligned do fast memmove */ |
| sub %o2, %o3, %o2 /* Residue bytes in %o2 */ |
| |
| .Lcpy_dbwdbc: /* alignment of src is needed */ |
| sub %o2, 8, %o2 /* set size one loop ahead */ |
| sll %o0, 3, %g1 /* %g1 is left shift */ |
| mov 64, %g5 /* init %g5 to be 64 */ |
| sub %g5, %g1, %g5 /* %g5 rightshift = (64 - leftshift) */ |
| sub %o5, %o0, %o5 /* align the src at 8 bytes. */ |
| add %o4, %o0, %o4 /* increase diff between src & dst */ |
| ldx [%o5], %o1 /* load first 8 bytes */ |
| srlx %o1, %g5, %o1 |
1: sub %o5, 8, %o5 /* subtract 8 from src */
ldx [%o5], %o0 /* load 8 bytes */
sllx %o0, %g1, %o3 /* shift loaded val left to tmp reg */
or %o1, %o3, %o3 /* align data */
stx %o3, [%o5+%o4] /* store 8 bytes */
subcc %o2, 8, %o2 /* subtract 8 bytes from size */
bg,pt %XCC, 1b /* if size > 0, continue */
srlx %o0, %g5, %o1 /* carry leftover bytes to the next pass */
| |
| srl %g1, 3, %o0 /* restore %o0 value for alignment */ |
| add %o5, %o0, %o5 /* restore src alignment */ |
| sub %o4, %o0, %o4 /* restore diff between src & dest */ |
| |
| ba 2f /* branch to the trailing byte copy */ |
| add %o2, 8, %o2 /* restore size value */ |
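
/* A C sketch of the shift-and-merge technique used above: with dst 8-byte
 * aligned, src misaligned by OFF (0 < OFF < 8) and S pointing at src
 * rounded down to 8 bytes, each big-endian store merges two adjacent
 * aligned loads.  The real loop carries the shifted remainder in a
 * register instead of reloading, and it arranges never to read past the
 * source tail (here the caller must guarantee s[n] is readable):
 *
 *   #include <stddef.h>
 *   #include <stdint.h>
 *   static void merge_copy_backward (uint64_t *dst, const uint64_t *s,
 *                                    unsigned off, size_t n)
 *   {
 *     unsigned ls = 8 * off, rs = 64 - ls;       // bit shifts for the merge
 *     for (size_t k = n; k-- > 0; )              // walk backwards, as above
 *       dst[k] = (s[k] << ls) | (s[k + 1] >> rs);
 *   }
 */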
| |
| .Ldbcopybc: /* alignment of src is not needed */ |
| 1: sub %o5, 8, %o5 /* subtract from src */ |
| ldx [%o5], %g1 /* load 8 bytes */ |
| subcc %o3, 8, %o3 /* subtract from size */ |
bgu,pt %XCC, 1b /* if size is bigger than 0, continue */
| stx %g1, [%o5+%o4] /* store 8 bytes to destination */ |
| |
| ba 2f |
| nop |
| |
| .Lbcbyte: |
| 1: ldub [%o5], %g1 /* load one byte */ |
| stb %g1, [%o5+%o4] /* store one byte */ |
| 2: deccc %o2 /* decrement size */ |
| bgeu,a,pt %XCC, 1b /* if size is >= 0 continue */ |
| dec %o5 /* decrement from address */ |
| |
| .Lexitbc: /* exit from backward copy */ |
| retl |
| add %o5, %o4, %o0 /* restore dest addr */ |
| |
| |
/* Check to see if this memmove is a large aligned copy.
 * If so, use a special version of the copy that avoids
 * the block store init.  */
| .Lforcpy: |
| cmp %o2, SMALL_MAX /* check for not small case */ |
| blt,pn %XCC, .Lmv_short /* merge with memcpy */ |
| mov %o0, %g1 /* save %o0 */ |
| neg %o0, %o5 |
| andcc %o5, 7, %o5 /* bytes till DST 8 byte aligned */ |
| brz,pt %o5, .Lmv_dst_aligned_on_8 |
| |
| /* %o5 has the bytes to be written in partial store. */ |
| sub %o2, %o5, %o2 |
| sub %o1, %o0, %o1 /* %o1 gets the difference */ |
| 7: /* dst aligning loop */ |
| ldub [%o1+%o0], %o4 /* load one byte */ |
| subcc %o5, 1, %o5 |
| stb %o4, [%o0] |
| bgu,pt %XCC, 7b |
| add %o0, 1, %o0 /* advance dst */ |
| add %o1, %o0, %o1 /* restore %o1 */ |
| .Lmv_dst_aligned_on_8: |
| andcc %o1, 7, %o5 |
| brnz,pn %o5, .Lsrc_dst_unaligned_on_8 |
| prefetch [%o1 + (1 * BLOCK_SIZE)], 20 |
| |
| .Lmv_src_dst_aligned_on_8: |
| /* check if we are copying MED_MAX or more bytes */ |
| cmp %o2, MED_MAX /* limit to store buffer size */ |
| bleu,pt %XCC, .Lmedlong |
| prefetch [%o1 + (2 * BLOCK_SIZE)], 20 |
| |
| /* The mv_align loop below mimics the memcpy code for large aligned copies, |
| * but does not use the ASI_STBI_P (block initializing store) performance |
| * optimization. This is used when memcpy is incorrectly invoked with |
| * overlapping buffers. */ |
| |
| .Lmv_large_align8_copy: /* Src and dst share 8 byte align */ |
| /* align dst to 64 byte boundary */ |
| andcc %o0, 0x3f, %o3 /* check for dst 64 byte aligned */ |
| brz,pn %o3, .Lmv_aligned_on_64 |
| sub %o3, 64, %o3 /* %o3 has negative bytes to move */ |
| add %o2, %o3, %o2 /* adjust remaining count */ |
| .Lmv_align_to_64: |
| ldx [%o1], %o4 |
| add %o1, 8, %o1 /* increment src ptr */ |
| addcc %o3, 8, %o3 |
| stx %o4, [%o0] |
| brnz,pt %o3, .Lmv_align_to_64 |
| add %o0, 8, %o0 /* increment dst ptr */ |
| |
| .Lmv_aligned_on_64: |
| andn %o2, 0x3f, %o5 /* %o5 is multiple of block size */ |
| and %o2, 0x3f, %o2 /* residue bytes in %o2 */ |
| .Lmv_align_loop: |
| ldx [%o1],%o4 |
| stx %o4,[%o0] |
| prefetch [%o0 + (10 * BLOCK_SIZE)], 22 |
| prefetch [%o1 + (10 * BLOCK_SIZE)], 21 |
| subcc %o5, 64, %o5 |
| ldx [%o1+8],%o4 |
| stx %o4,[%o0+8] |
| ldx [%o1+16],%o4 |
| stx %o4,[%o0+16] |
| ldx [%o1+24],%o4 |
| stx %o4,[%o0+24] |
| ldx [%o1+32],%o4 |
| stx %o4,[%o0+32] |
| ldx [%o1+40],%o4 |
| stx %o4,[%o0+40] |
| ldx [%o1+48],%o4 |
| add %o1, 64, %o1 |
| stx %o4,[%o0+48] |
| add %o0, 64, %o0 |
| ldx [%o1-8],%o4 |
| bgt,pt %XCC, .Lmv_align_loop |
| stx %o4,[%o0-8] |
| |
| ba .Lmedlong |
| nop |
| END(__memmove_niagara7) |
| |
| ENTRY(__mempcpy_niagara7) |
| /* %o0=dst, %o1=src, %o2=len */ |
| ba,pt %icc, 101f |
| add %o0, %o2, %g1 /* save dst + len */ |
| END(__mempcpy_niagara7) |
| |
| .align 32 |
| ENTRY(__memcpy_niagara7) |
| 100: /* %o0=dst, %o1=src, %o2=len */ |
| mov %o0, %g1 /* save %o0 */ |
| 101: |
| #ifndef __arch64__ |
| srl %o2, 0, %o2 |
| #endif |
| cmp %o2, SMALL_MAX /* check for not small case */ |
| bgeu,pn %XCC, .Lmedium /* go to larger cases */ |
| .Lmv_short: |
| cmp %o2, SHORTCOPY /* check for really short case */ |
| ble,pn %XCC, .Lsmallfin |
| or %o0, %o1, %o4 /* prepare alignment check */ |
| andcc %o4, 0x3, %o5 /* test for word alignment */ |
| bnz,pn %XCC, .Lsmallunalign /* branch to non-word aligned case */ |
| nop |
| subcc %o2, 7, %o2 /* adjust count */ |
| ble,pn %XCC, .Lsmallwordx |
| andcc %o4, 0x7, %o5 /* test for long alignment */ |
/* 8 or more bytes, src and dest start on word boundary;
 * %o4 contains %o0 OR %o1 for the alignment test */
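/* The OR trick tests both pointers' alignment at once; in C (a sketch):
 *
 *   if ((((uintptr_t) dst | (uintptr_t) src) & 7) == 0)
 *     ...   // both are 8-byte aligned: use the doubleword loop
 */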
| .Lsmalllong: |
| bnz,pn %XCC, .Lsmallwords /* branch to word aligned case */ |
cmp %o2, SHORT_LONG-7
bge,a %XCC, .Lmedl64 /* annulled delay slot: runs only if we branch */
sub %o2, 56, %o2 /* adjust %o2 to -63 off count */
| |
| /* slightly unroll the small_long_loop to improve very short copies */ |
| cmp %o2, 32-7 |
| blt,a,pn %XCC, .Lsmall_long_l |
| sub %o1, %o0, %o1 /* %o1 gets the difference */ |
| |
| ldx [%o1], %o5 |
| ldx [%o1+8], %o4 |
| ldx [%o1+16], %o3 |
| |
| subcc %o2, 24, %o2 |
| sub %o1, %o0, %o1 /* %o1 gets the difference */ |
| |
| stx %o5, [%o0] /* write word */ |
| stx %o4, [%o0+8] /* write word */ |
| stx %o3, [%o0+16] /* write word */ |
| |
| add %o0, 24, %o0 |
| |
| /* end loop unroll */ |
| |
| .Lsmall_long_l: |
| ldx [%o1+%o0], %o3 |
| subcc %o2, 8, %o2 |
| add %o0, 8, %o0 |
| bgu,pn %XCC, .Lsmall_long_l /* loop until done */ |
| stx %o3, [%o0-8] /* write word */ |
| addcc %o2, 7, %o2 /* restore %o2 to correct count */ |
| bnz,pn %XCC, .Lsmall_long_x /* check for completion */ |
| add %o1, %o0, %o1 /* restore %o1 */ |
| retl |
| mov EX_RETVAL(%g1), %o0 /* restore %o0 */ |
| .Lsmall_long_x: |
| cmp %o2, 4 /* check for 4 or more bytes left */ |
| blt,pn %XCC, .Lsmallleft3 /* if not, go to finish up */ |
| nop |
| lduw [%o1], %o3 |
| add %o1, 4, %o1 |
| subcc %o2, 4, %o2 |
| stw %o3, [%o0] |
| bnz,pn %XCC, .Lsmallleft3 |
| add %o0, 4, %o0 |
| retl |
| mov EX_RETVAL(%g1), %o0 /* restore %o0 */ |
| |
| .align 32 |
| /* src and dest start on word boundary; 7 or fewer bytes */ |
| .Lsmallwordx: |
| lduw [%o1], %o3 /* read word */ |
| addcc %o2, 3, %o2 /* restore count */ |
| bz,pt %XCC, .Lsmallexit |
| stw %o3, [%o0] /* write word */ |
| deccc %o2 /* reduce count for cc test */ |
| ldub [%o1+4], %o3 /* load one byte */ |
| bz,pt %XCC, .Lsmallexit |
| stb %o3, [%o0+4] /* store one byte */ |
| ldub [%o1+5], %o3 /* load second byte */ |
| deccc %o2 |
| bz,pt %XCC, .Lsmallexit |
| stb %o3, [%o0+5] /* store second byte */ |
| ldub [%o1+6], %o3 /* load third byte */ |
| stb %o3, [%o0+6] /* store third byte */ |
| .Lsmallexit: |
| retl |
| mov EX_RETVAL(%g1), %o0 /* restore %o0 */ |
| |
| .align 32 |
| .Lsmallunalign: |
| cmp %o2, SHORTCHECK |
| ble,pn %XCC, .Lsmallrest |
| cmp %o2, SMALL_UMAX |
| bge,pt %XCC, .Lmedium_join |
| andcc %o1, 0x3, %o5 /* is src word aligned */ |
| bz,pn %XCC, .Laldst |
| cmp %o5, 2 /* is src half-word aligned */ |
| be,pt %XCC, .Ls2algn |
| cmp %o5, 3 /* src is byte aligned */ |
| .Ls1algn: |
| ldub [%o1], %o3 /* move 1 or 3 bytes to align it */ |
| inc 1, %o1 |
| stb %o3, [%o0] /* move a byte to align src */ |
| inc 1, %o0 |
| bne,pt %XCC, .Ls2algn |
| dec %o2 |
| b .Lald /* now go align dest */ |
| andcc %o0, 0x3, %o5 |
| |
| .Ls2algn: |
| lduh [%o1], %o3 /* know src is 2 byte aligned */ |
| inc 2, %o1 |
| srl %o3, 8, %o4 |
| stb %o4, [%o0] /* have to do bytes, */ |
| stb %o3, [%o0 + 1] /* do not know dst alignment */ |
| inc 2, %o0 |
| dec 2, %o2 |
| |
| .Laldst: |
| andcc %o0, 0x3, %o5 /* align the destination address */ |
| .Lald: |
| bz,pn %XCC, .Lw4cp |
| cmp %o5, 2 |
| be,pn %XCC, .Lw2cp |
| cmp %o5, 3 |
| .Lw3cp: lduw [%o1], %o4 |
| inc 4, %o1 |
| srl %o4, 24, %o5 |
| stb %o5, [%o0] |
| bne,pt %XCC, .Lw1cp |
| inc %o0 |
| dec 1, %o2 |
| andn %o2, 3, %o3 /* %o3 is aligned word count */ |
| dec 4, %o3 /* avoid reading beyond tail of src */ |
| sub %o1, %o0, %o1 /* %o1 gets the difference */ |
| |
| 1: sll %o4, 8, %g5 /* save residual bytes */ |
| lduw [%o1+%o0], %o4 |
| deccc 4, %o3 |
| srl %o4, 24, %o5 /* merge with residual */ |
| or %o5, %g5, %g5 |
| st %g5, [%o0] |
| bnz,pt %XCC, 1b |
| inc 4, %o0 |
| sub %o1, 3, %o1 /* used one byte of last word read */ |
| and %o2, 3, %o2 |
| b 7f |
| inc 4, %o2 |
| |
| .Lw1cp: srl %o4, 8, %o5 |
| sth %o5, [%o0] |
| inc 2, %o0 |
| dec 3, %o2 |
| andn %o2, 3, %o3 /* %o3 is aligned word count */ |
| dec 4, %o3 /* avoid reading beyond tail of src */ |
| sub %o1, %o0, %o1 /* %o1 gets the difference */ |
| |
| 2: sll %o4, 24, %g5 /* save residual bytes */ |
| lduw [%o1+%o0], %o4 |
| deccc 4, %o3 |
| srl %o4, 8, %o5 /* merge with residual */ |
| or %o5, %g5, %g5 |
| st %g5, [%o0] |
| bnz,pt %XCC, 2b |
| inc 4, %o0 |
| sub %o1, 1, %o1 /* used 3 bytes of last word read */ |
| and %o2, 3, %o2 |
| b 7f |
| inc 4, %o2 |
| |
| .Lw2cp: lduw [%o1], %o4 |
| inc 4, %o1 |
| srl %o4, 16, %o5 |
| sth %o5, [%o0] |
| inc 2, %o0 |
| dec 2, %o2 |
| andn %o2, 3, %o3 /* %o3 is aligned word count */ |
| dec 4, %o3 /* avoid reading beyond tail of src */ |
| sub %o1, %o0, %o1 /* %o1 gets the difference */ |
| |
| 3: sll %o4, 16, %g5 /* save residual bytes */ |
| lduw [%o1+%o0], %o4 |
| deccc 4, %o3 |
| srl %o4, 16, %o5 /* merge with residual */ |
| or %o5, %g5, %g5 |
| st %g5, [%o0] |
| bnz,pt %XCC, 3b |
| inc 4, %o0 |
| sub %o1, 2, %o1 /* used two bytes of last word read */ |
| and %o2, 3, %o2 |
| b 7f |
| inc 4, %o2 |
| |
| .Lw4cp: andn %o2, 3, %o3 /* %o3 is aligned word count */ |
| sub %o1, %o0, %o1 /* %o1 gets the difference */ |
| |
| 1: lduw [%o1+%o0], %o4 /* read from address */ |
| deccc 4, %o3 /* decrement count */ |
| st %o4, [%o0] /* write at destination address */ |
| bgu,pt %XCC, 1b |
| inc 4, %o0 /* increment to address */ |
| and %o2, 3, %o2 /* number of leftover bytes, if any */ |
| |
| /* simple finish up byte copy, works with any alignment */ |
| 7: |
| add %o1, %o0, %o1 /* restore %o1 */ |
| .Lsmallrest: |
| tst %o2 |
| bz,pt %XCC, .Lsmallx |
| cmp %o2, 4 |
| blt,pn %XCC, .Lsmallleft3 |
| nop |
| sub %o2, 3, %o2 |
| .Lsmallnotalign4: |
| ldub [%o1], %o3 /* read byte */ |
| subcc %o2, 4, %o2 /* reduce count by 4 */ |
| stb %o3, [%o0] /* write byte */ |
| ldub [%o1+1], %o3 /* repeat for total of 4 bytes */ |
| add %o1, 4, %o1 /* advance SRC by 4 */ |
| stb %o3, [%o0+1] |
| ldub [%o1-2], %o3 |
| add %o0, 4, %o0 /* advance DST by 4 */ |
| stb %o3, [%o0-2] |
| ldub [%o1-1], %o3 |
bgu,pt %XCC, .Lsmallnotalign4 /* loop till 3 or fewer bytes remain */
| stb %o3, [%o0-1] |
| addcc %o2, 3, %o2 /* restore count */ |
| bz,pt %XCC, .Lsmallx |
| .Lsmallleft3: /* 1, 2, or 3 bytes remain */ |
| subcc %o2, 1, %o2 |
| ldub [%o1], %o3 /* load one byte */ |
| bz,pt %XCC, .Lsmallx |
| stb %o3, [%o0] /* store one byte */ |
| ldub [%o1+1], %o3 /* load second byte */ |
| subcc %o2, 1, %o2 |
| bz,pt %XCC, .Lsmallx |
| stb %o3, [%o0+1] /* store second byte */ |
| ldub [%o1+2], %o3 /* load third byte */ |
| stb %o3, [%o0+2] /* store third byte */ |
| .Lsmallx: |
| retl |
| mov EX_RETVAL(%g1), %o0 /* restore %o0 */ |
| |
| .Lsmallfin: |
| tst %o2 |
| bnz,pn %XCC, .Lsmallleft3 |
| nop |
| retl |
| mov EX_RETVAL(%g1), %o0 /* restore %o0 */ |
| |
| .align 16 |
| .Lsmallwords: |
| lduw [%o1], %o3 /* read word */ |
| subcc %o2, 8, %o2 /* update count */ |
| stw %o3, [%o0] /* write word */ |
| add %o1, 8, %o1 /* update SRC */ |
| lduw [%o1-4], %o3 /* read word */ |
| add %o0, 8, %o0 /* update DST */ |
| bgu,pt %XCC, .Lsmallwords /* loop until done */ |
| stw %o3, [%o0-4] /* write word */ |
| addcc %o2, 7, %o2 /* restore count */ |
| bz,pt %XCC, .Lsmallexit /* check for completion */ |
| cmp %o2, 4 /* check for 4 or more bytes left */ |
| blt,pt %XCC, .Lsmallleft3 /* if not, go to finish up */ |
| nop |
| lduw [%o1], %o3 |
| add %o1, 4, %o1 |
| subcc %o2, 4, %o2 |
| add %o0, 4, %o0 |
| bnz,pn %XCC, .Lsmallleft3 |
| stw %o3, [%o0-4] |
| retl |
| mov EX_RETVAL(%g1), %o0 /* restore %o0 */ |
| |
| .align 16 |
| .Lmedium: |
| .Lmedium_join: |
| neg %o0, %o5 |
| andcc %o5, 7, %o5 /* bytes till DST 8 byte aligned */ |
| brz,pt %o5, .Ldst_aligned_on_8 |
| |
| /* %o5 has the bytes to be written in partial store. */ |
| sub %o2, %o5, %o2 |
| sub %o1, %o0, %o1 /* %o1 gets the difference */ |
| 7: /* dst aligning loop */ |
| ldub [%o1+%o0], %o4 /* load one byte */ |
| subcc %o5, 1, %o5 |
| stb %o4, [%o0] |
| bgu,pt %XCC, 7b |
| add %o0, 1, %o0 /* advance dst */ |
| add %o1, %o0, %o1 /* restore %o1 */ |
| .Ldst_aligned_on_8: |
| andcc %o1, 7, %o5 |
| brnz,pt %o5, .Lsrc_dst_unaligned_on_8 |
| nop |
| |
| .Lsrc_dst_aligned_on_8: |
| /* check if we are copying MED_MAX or more bytes */ |
| cmp %o2, MED_MAX /* limit to store buffer size */ |
| bgu,pn %XCC, .Llarge_align8_copy |
| nop |
| /* |
| * Special case for handling when src and dest are both long word aligned |
| * and total data to move is less than MED_MAX bytes |
| */ |
| .Lmedlong: |
| subcc %o2, 63, %o2 /* adjust length to allow cc test */ |
| ble,pn %XCC, .Lmedl63 /* skip big loop if < 64 bytes */ |
| nop |
| .Lmedl64: |
| ldx [%o1], %o4 /* load */ |
| subcc %o2, 64, %o2 /* decrement length count */ |
| stx %o4, [%o0] /* and store */ |
| ldx [%o1+8], %o3 /* a block of 64 bytes */ |
| stx %o3, [%o0+8] |
| ldx [%o1+16], %o4 |
| stx %o4, [%o0+16] |
| ldx [%o1+24], %o3 |
| stx %o3, [%o0+24] |
| ldx [%o1+32], %o4 /* load */ |
| stx %o4, [%o0+32] /* and store */ |
| ldx [%o1+40], %o3 /* a block of 64 bytes */ |
| add %o1, 64, %o1 /* increase src ptr by 64 */ |
| stx %o3, [%o0+40] |
| ldx [%o1-16], %o4 |
| add %o0, 64, %o0 /* increase dst ptr by 64 */ |
| stx %o4, [%o0-16] |
| ldx [%o1-8], %o3 |
| bgu,pt %XCC, .Lmedl64 /* repeat if at least 64 bytes left */ |
| stx %o3, [%o0-8] |
| .Lmedl63: |
| addcc %o2, 32, %o2 /* adjust remaining count */ |
| ble,pt %XCC, .Lmedl31 /* to skip if 31 or fewer bytes left */ |
| nop |
| ldx [%o1], %o4 /* load */ |
| sub %o2, 32, %o2 /* decrement length count */ |
| stx %o4, [%o0] /* and store */ |
| ldx [%o1+8], %o3 /* a block of 32 bytes */ |
| add %o1, 32, %o1 /* increase src ptr by 32 */ |
| stx %o3, [%o0+8] |
| ldx [%o1-16], %o4 |
| add %o0, 32, %o0 /* increase dst ptr by 32 */ |
| stx %o4, [%o0-16] |
| ldx [%o1-8], %o3 |
| stx %o3, [%o0-8] |
| .Lmedl31: |
| addcc %o2, 16, %o2 /* adjust remaining count */ |
| ble,pt %XCC, .Lmedl15 /* skip if 15 or fewer bytes left */ |
| nop |
| ldx [%o1], %o4 /* load and store 16 bytes */ |
| add %o1, 16, %o1 /* increase src ptr by 16 */ |
| stx %o4, [%o0] |
| sub %o2, 16, %o2 /* decrease count by 16 */ |
| ldx [%o1-8], %o3 |
| add %o0, 16, %o0 /* increase dst ptr by 16 */ |
| stx %o3, [%o0-8] |
| .Lmedl15: |
| addcc %o2, 15, %o2 /* restore count */ |
| bz,pt %XCC, .Lsmallexit /* exit if finished */ |
| cmp %o2, 8 |
| blt,pt %XCC, .Lmedw7 /* skip if 7 or fewer bytes left */ |
| tst %o2 |
| ldx [%o1], %o4 /* load 8 bytes */ |
| add %o1, 8, %o1 /* increase src ptr by 8 */ |
| add %o0, 8, %o0 /* increase dst ptr by 8 */ |
| subcc %o2, 8, %o2 /* decrease count by 8 */ |
| bnz,pn %XCC, .Lmedw7 |
| stx %o4, [%o0-8] /* and store 8 bytes */ |
| retl |
| mov EX_RETVAL(%g1), %o0 /* restore %o0 */ |
| |
| .align 16 |
| .Lsrc_dst_unaligned_on_8: |
| /* DST is 8-byte aligned, src is not */ |
| andcc %o1, 0x3, %o5 /* test word alignment */ |
| bnz,pt %XCC, .Lunalignsetup /* branch if not word aligned */ |
| nop |
| |
| /* |
| * Handle all cases where src and dest are aligned on word |
| * boundaries. Use unrolled loops for better performance. |
| * This option wins over standard large data move when |
| * source and destination is in cache for medium |
| * to short data moves. |
| */ |
| cmp %o2, MED_WMAX /* limit to store buffer size */ |
bge,pt %XCC, .Lunalignrejoin /* if large, use the fp unaligned loop */
| nop |
| |
| subcc %o2, 31, %o2 /* adjust length to allow cc test */ |
| /* for end of loop */ |
ble,pt %XCC, .Lmedw31 /* skip big loop if fewer than 32 bytes */
| .Lmedw32: |
| ld [%o1], %o4 /* move a block of 32 bytes */ |
| sllx %o4, 32, %o5 |
| ld [%o1+4], %o4 |
| or %o4, %o5, %o5 |
| stx %o5, [%o0] |
| subcc %o2, 32, %o2 /* decrement length count */ |
| ld [%o1+8], %o4 |
| sllx %o4, 32, %o5 |
| ld [%o1+12], %o4 |
| or %o4, %o5, %o5 |
| stx %o5, [%o0+8] |
| add %o1, 32, %o1 /* increase src ptr by 32 */ |
| ld [%o1-16], %o4 |
| sllx %o4, 32, %o5 |
| ld [%o1-12], %o4 |
| or %o4, %o5, %o5 |
| stx %o5, [%o0+16] |
| add %o0, 32, %o0 /* increase dst ptr by 32 */ |
| ld [%o1-8], %o4 |
| sllx %o4, 32, %o5 |
| ld [%o1-4], %o4 |
| or %o4, %o5, %o5 |
| bgu,pt %XCC, .Lmedw32 /* repeat if at least 32 bytes left */ |
| stx %o5, [%o0-8] |
| .Lmedw31: |
| addcc %o2, 31, %o2 /* restore count */ |
| bz,pt %XCC, .Lsmallexit /* exit if finished */ |
| cmp %o2, 16 |
| blt,pt %XCC, .Lmedw15 |
| nop |
| ld [%o1], %o4 /* move a block of 16 bytes */ |
| sllx %o4, 32, %o5 |
| subcc %o2, 16, %o2 /* decrement length count */ |
| ld [%o1+4], %o4 |
| or %o4, %o5, %o5 |
| stx %o5, [%o0] |
| add %o1, 16, %o1 /* increase src ptr by 16 */ |
| ld [%o1-8], %o4 |
| add %o0, 16, %o0 /* increase dst ptr by 16 */ |
| sllx %o4, 32, %o5 |
| ld [%o1-4], %o4 |
| or %o4, %o5, %o5 |
| stx %o5, [%o0-8] |
| .Lmedw15: |
| bz,pt %XCC, .Lsmallexit /* exit if finished */ |
| cmp %o2, 8 |
| blt,pn %XCC, .Lmedw7 /* skip if 7 or fewer bytes left */ |
| tst %o2 |
| ld [%o1], %o4 /* load 4 bytes */ |
| subcc %o2, 8, %o2 /* decrease count by 8 */ |
| stw %o4, [%o0] /* and store 4 bytes */ |
| add %o1, 8, %o1 /* increase src ptr by 8 */ |
| ld [%o1-4], %o3 /* load 4 bytes */ |
| add %o0, 8, %o0 /* increase dst ptr by 8 */ |
| stw %o3, [%o0-4] /* and store 4 bytes */ |
| bz,pt %XCC, .Lsmallexit /* exit if finished */ |
| .Lmedw7: /* count is ge 1, less than 8 */ |
| cmp %o2, 4 /* check for 4 bytes left */ |
| blt,pn %XCC, .Lsmallleft3 /* skip if 3 or fewer bytes left */ |
| nop |
| ld [%o1], %o4 /* load 4 bytes */ |
| add %o1, 4, %o1 /* increase src ptr by 4 */ |
| add %o0, 4, %o0 /* increase dst ptr by 4 */ |
| subcc %o2, 4, %o2 /* decrease count by 4 */ |
| bnz,pt %XCC, .Lsmallleft3 |
| stw %o4, [%o0-4] /* and store 4 bytes */ |
| retl |
| mov EX_RETVAL(%g1), %o0 /* restore %o0 */ |
| |
| .align 16 |
| .Llarge_align8_copy: /* Src and dst 8 byte aligned */ |
| /* align dst to 64 byte boundary */ |
| andcc %o0, 0x3f, %o3 /* check for dst 64 byte aligned */ |
| brz,pn %o3, .Laligned_to_64 |
| andcc %o0, 8, %o3 /* odd long words to move? */ |
| brz,pt %o3, .Laligned_to_16 |
| nop |
| ldx [%o1], %o4 |
| sub %o2, 8, %o2 |
| add %o1, 8, %o1 /* increment src ptr */ |
| add %o0, 8, %o0 /* increment dst ptr */ |
| stx %o4, [%o0-8] |
| .Laligned_to_16: |
| andcc %o0, 16, %o3 /* pair of long words to move? */ |
| brz,pt %o3, .Laligned_to_32 |
| nop |
| ldx [%o1], %o4 |
| sub %o2, 16, %o2 |
| stx %o4, [%o0] |
| add %o1, 16, %o1 /* increment src ptr */ |
| ldx [%o1-8], %o4 |
| add %o0, 16, %o0 /* increment dst ptr */ |
| stx %o4, [%o0-8] |
| .Laligned_to_32: |
| andcc %o0, 32, %o3 /* four long words to move? */ |
| brz,pt %o3, .Laligned_to_64 |
| nop |
| ldx [%o1], %o4 |
| sub %o2, 32, %o2 |
| stx %o4, [%o0] |
| ldx [%o1+8], %o4 |
| stx %o4, [%o0+8] |
| ldx [%o1+16], %o4 |
| stx %o4, [%o0+16] |
| add %o1, 32, %o1 /* increment src ptr */ |
| ldx [%o1-8], %o4 |
| add %o0, 32, %o0 /* increment dst ptr */ |
| stx %o4, [%o0-8] |
| .Laligned_to_64: |
/* The following test is included to avoid issues where existing executables
 * incorrectly call memcpy with overlapping src and dest instead of memmove.
| * |
| * if ( (src ge dst) and (dst+len > src)) go to overlap case |
| * if ( (src lt dst) and (src+len > dst)) go to overlap case |
| */ |
| cmp %o1,%o0 |
| bge,pt %XCC, 1f |
| nop |
| /* src+len > dst? */ |
| add %o1, %o2, %o4 |
| cmp %o4, %o0 |
| bgt,pt %XCC, .Lmv_aligned_on_64 |
| nop |
| ba 2f |
| nop |
| 1: |
| /* dst+len > src? */ |
| add %o0, %o2, %o4 |
| cmp %o4, %o1 |
| bgt,pt %XCC, .Lmv_aligned_on_64 |
| nop |
| 2: |
| /* handle non-overlapped copies |
| * |
| * Using block init store (BIS) instructions to avoid fetching cache |
| * lines from memory. Use ST_CHUNK stores to first element of each cache |
| * line (similar to prefetching) to avoid overfilling STQ or miss buffers. |
| * Gives existing cache lines time to be moved out of L1/L2/L3 cache. |
| */ |
| andn %o2, 0x3f, %o5 /* %o5 is multiple of block size */ |
| and %o2, 0x3f, %o2 /* residue bytes in %o2 */ |
| |
/* We use ASI_STBIMRU_P for the first store to each cache line,
 * followed by ASI_STBI_P (mark as LRU) for the last store.  That
 * mixed approach reduces the chances the cache line is removed
 * before we finish filling it, while minimizing the effects on
 * other cached values during a large memcpy.
 *
 * Intermediate stores can be normal since the first BIS activates the
 * cache line in the L2 cache.
 *
 * ST_CHUNK batches up the initial BIS operations for several cache lines
 * so that multiple requests are not blocked by overflowing the
 * store miss buffer.  Then the matching stores for all those
 * BIS operations are executed.
| */ |
| |
| .Lalign_loop: |
| cmp %o5, ST_CHUNK*64 |
| blu,pt %XCC, .Lalign_short |
| mov ST_CHUNK, %o3 |
| sllx %o3, 6, %g5 /* ST_CHUNK*64 */ |
| |
| .Lalign_loop_start: |
| prefetch [%o1 + (ALIGN_PRE * BLOCK_SIZE)], 21 |
| subcc %o3, 2, %o3 |
| ldx [%o1], %o4 |
| add %o1, 128, %o1 |
| EX_ST(STORE_ASI(%o4, %o0)) |
| add %o0, 64, %o0 |
| ldx [%o1-64], %o4 |
| EX_ST(STORE_ASI(%o4, %o0)) |
| add %o0, 64, %o0 |
| bgu,pt %XCC, .Lalign_loop_start |
| prefetch [%o1 + ((ALIGN_PRE-1) * BLOCK_SIZE)], 21 |
| |
| mov ST_CHUNK, %o3 |
| sub %o1, %g5, %o1 /* reset %o1 */ |
| sub %o0, %g5, %o0 /* reset %o0 */ |
| |
| sub %o0, 8, %o0 /* adjust %o0 for ASI alignment */ |
| .Lalign_loop_rest: |
| ldx [%o1+8],%o4 |
| add %o0, 64, %o0 |
| stx %o4, [%o0-48] |
| subcc %o3, 1, %o3 |
| ldx [%o1+16],%o4 |
| stx %o4, [%o0-40] |
| sub %o5, 64, %o5 |
| ldx [%o1+24],%o4 |
| stx %o4, [%o0-32] |
| ldx [%o1+32],%o4 |
| stx %o4, [%o0-24] |
| ldx [%o1+40],%o4 |
| stx %o4, [%o0-16] |
| ldx [%o1+48],%o4 |
| stx %o4, [%o0-8] |
| add %o1, 64, %o1 |
| ldx [%o1-8],%o4 |
| bgu,pt %XCC, .Lalign_loop_rest |
| EX_ST(STORE_INIT(%o4,%o0)) /* mark cache line as LRU */ |
| |
| mov ST_CHUNK, %o3 |
| cmp %o5, ST_CHUNK*64 |
| bgu,pt %XCC, .Lalign_loop_start |
| add %o0, 8, %o0 /* restore %o0 from ASI alignment */ |
| |
| cmp %o5, 0 |
| beq,pt %XCC, .Lalign_done |
| |
| /* no prefetches needed in these loops |
| * since we are within ALIGN_PRE of the end */ |
| .Lalign_short: |
| srl %o5, 6, %o3 |
| .Lalign_loop_short: |
| subcc %o3, 1, %o3 |
| ldx [%o1], %o4 |
| add %o1, 64, %o1 |
| EX_ST(STORE_ASI(%o4, %o0)) |
| bgu,pt %XCC, .Lalign_loop_short |
| add %o0, 64, %o0 |
| |
| sub %o1, %o5, %o1 /* reset %o1 */ |
| sub %o0, %o5, %o0 /* reset %o0 */ |
| |
| sub %o0, 8, %o0 /* adjust %o0 for ASI alignment */ |
| .Lalign_short_rest: |
| ldx [%o1+8],%o4 |
| add %o0, 64, %o0 |
| stx %o4, [%o0-48] |
| ldx [%o1+16],%o4 |
| subcc %o5, 64, %o5 |
| stx %o4, [%o0-40] |
| ldx [%o1+24],%o4 |
| stx %o4, [%o0-32] |
| ldx [%o1+32],%o4 |
| stx %o4, [%o0-24] |
| ldx [%o1+40],%o4 |
| stx %o4, [%o0-16] |
| ldx [%o1+48],%o4 |
| stx %o4, [%o0-8] |
| add %o1, 64, %o1 |
| ldx [%o1-8],%o4 |
| bgu,pt %XCC, .Lalign_short_rest |
| EX_ST(STORE_INIT(%o4,%o0)) /* mark cache line as LRU */ |
| |
| add %o0, 8, %o0 /* restore %o0 from ASI alignment */ |
| |
| .Lalign_done: |
| cmp %o2, 0 |
| membar #StoreStore |
| bne,pt %XCC, .Lmedl63 |
| subcc %o2, 63, %o2 /* adjust length to allow cc test */ |
| retl |
| mov EX_RETVAL(%g1), %o0 /* restore %o0 */ |
| |
| .align 16 |
| /* Dst is on 8 byte boundary; src is not; remaining cnt > SMALL_MAX */ |
| /* Since block load/store and BIS are not in use for unaligned data, |
| * no need to align dst on 64 byte cache line boundary */ |
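/* A big-endian C analogue of the faligndata step used below (a sketch, not
 * the full VIS semantics; OFF is the byte offset latched by alignaddr):
 *
 *   #include <stdint.h>
 *   static uint64_t falign_c (uint64_t w0, uint64_t w1, unsigned off)
 *   {
 *     if (off == 0)
 *       return w0;                 // already aligned; avoids a 64-bit shift
 *     unsigned ls = 8 * off;
 *     return (w0 << ls) | (w1 >> (64 - ls));  // merge adjacent doublewords
 *   }
 */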
| .Lunalignsetup: |
| .Lunalignrejoin: |
| rd %fprs, %g5 /* check for unused fp */ |
| /* if fprs.fef == 0, set it. |
| * Setting it when already set costs more than checking */ |
| andcc %g5, FPRS_FEF, %g5 /* test FEF, fprs.du = fprs.dl = 0 */ |
| bz,a %XCC, 1f |
| wr %g0, FPRS_FEF, %fprs /* fprs.fef = 1 */ |
| 1: |
| andn %o2, 0x3f, %o5 /* %o5 is multiple of block size */ |
| and %o2, 0x3f, %o2 /* residue bytes in %o2 */ |
cmp %o2, 8 /* ensure we do not load beyond */
bgt,pt %XCC, .Lunalign_adjust /* end of source buffer */
| andn %o1, 0x7, %o4 /* %o4 has 8 byte aligned src addr */ |
| add %o2, 64, %o2 /* adjust to leave loop */ |
| sub %o5, 64, %o5 /* early if necessary */ |
| .Lunalign_adjust: |
| alignaddr %o1, %g0, %g0 /* generate %gsr */ |
| add %o1, %o5, %o1 /* advance %o1 to after blocks */ |
| ldd [%o4], %f0 |
| .Lunalign_loop: |
| prefetch [%o0 + (9 * BLOCK_SIZE)], 20 |
| ldd [%o4+8], %f2 |
| faligndata %f0, %f2, %f16 |
| ldd [%o4+16], %f4 |
| subcc %o5, BLOCK_SIZE, %o5 |
| std %f16, [%o0] |
| faligndata %f2, %f4, %f18 |
| ldd [%o4+24], %f6 |
| std %f18, [%o0+8] |
| faligndata %f4, %f6, %f20 |
| ldd [%o4+32], %f8 |
| std %f20, [%o0+16] |
| faligndata %f6, %f8, %f22 |
| ldd [%o4+40], %f10 |
| std %f22, [%o0+24] |
| faligndata %f8, %f10, %f24 |
| ldd [%o4+48], %f12 |
| std %f24, [%o0+32] |
| faligndata %f10, %f12, %f26 |
| ldd [%o4+56], %f14 |
| add %o4, BLOCK_SIZE, %o4 |
| std %f26, [%o0+40] |
| faligndata %f12, %f14, %f28 |
| ldd [%o4], %f0 |
| std %f28, [%o0+48] |
| faligndata %f14, %f0, %f30 |
| std %f30, [%o0+56] |
| add %o0, BLOCK_SIZE, %o0 |
| bgu,pt %XCC, .Lunalign_loop |
| prefetch [%o4 + (11 * BLOCK_SIZE)], 20 |
| |
/* Handle the remaining trailing bytes
 * Dest long word aligned, Src not long word aligned */
| cmp %o2, 15 |
| bleu,pt %XCC, .Lunalign_short |
| |
| andn %o2, 0x7, %o5 /* %o5 is multiple of 8 */ |
| and %o2, 0x7, %o2 /* residue bytes in %o2 */ |
| add %o2, 8, %o2 |
| sub %o5, 8, %o5 /* do not load past end of src */ |
| andn %o1, 0x7, %o4 /* %o4 has 8 byte aligned src addr */ |
| add %o1, %o5, %o1 /* move %o1 to after multiple of 8 */ |
| ldd [%o4], %f0 /* fetch partial word */ |
| .Lunalign_by8: |
| ldd [%o4+8], %f2 |
| add %o4, 8, %o4 |
| faligndata %f0, %f2, %f16 |
| subcc %o5, 8, %o5 |
| std %f16, [%o0] |
| fsrc2 %f2, %f0 |
| bgu,pt %XCC, .Lunalign_by8 |
| add %o0, 8, %o0 |
| |
| .Lunalign_short: /* restore fprs state */ |
| brnz,pt %g5, .Lsmallrest |
| nop |
| ba .Lsmallrest |
| wr %g5, %g0, %fprs |
| END(__memcpy_niagara7) |
| |
| #endif |