 /* google3/third_party/grte/v5_src/glibc-2.27/sysdeps/sparc/sparc64/multiarch/memcpy-memmove-niagara7.S - GRTEv5 - Git at Google  */

 /* Copy SIZE bytes from SRC to DEST.  For SUN4V M7.
    Copyright (C) 2017-2018 Free Software Foundation, Inc.
    This file is part of the GNU C Library.

    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
    version 2.1 of the License, or (at your option) any later version.

    The GNU C Library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */

 #include <sysdep.h>

 /* Condition-code register named by the branch instructions below.
    Defaults to xcc unless XCC was already defined before this point.  */
 #ifndef XCC
 # define XCC    xcc
 #endif
 	/* Tell the assembler that %g2, %g3 and %g6 are scratch registers.  */
 	.register	%g2,#scratch
 	.register	%g3,#scratch
 	.register	%g6,#scratch

 /* Mask for the FEF bit of %fprs.  .Lunalignsetup tests it, and sets it
    when clear, before the floating-point registers are used.  */
 #define	FPRS_FEF	0x04

 /*
  * ASI_STBI_P marks the cache line as "least recently used"
  * which means if many threads are active, it has a high chance
  * of being pushed out of the cache between the first initializing
  * store and the final stores.
  * Thus, in this algorithm we use ASI_STBIMRU_P, which marks the
  * cache line as "most recently used", for the first store to each
  * cache line, and ASI_STBI_P only for the last store to that line.
  */

 #define	ASI_BLK_INIT_QUAD_LDD_P	0xe2
 #define	ASI_ST_BLK_INIT_MRU_P	0xf2

 /* Short aliases used by the STORE_INIT / STORE_ASI macros below.  */
 #define	ASI_STBI_P	ASI_BLK_INIT_QUAD_LDD_P
 #define	ASI_STBIMRU_P	ASI_ST_BLK_INIT_MRU_P

 #define	BLOCK_SIZE	64	/* L2 data cache line size  */
 /* Counts of SHORTCOPY or fewer bytes go straight to the byte tail
    (.Lsmallfin).  */
 #define	SHORTCOPY	3
 /* Non-word-aligned copies of SHORTCHECK or fewer bytes use the plain
    byte loop (.Lsmallrest).  */
 #define	SHORTCHECK	14
 #define	SHORT_LONG	64	/* max copy for short longword-aligned case  */
 				/* must be at least 64  */
 #define	SMALL_MAX	255	/* max small copy for word/long aligned  */
 #define	SMALL_UMAX	128	/* max small copy for unaligned case  */
 #define	MED_WMAX	1023	/* max copy for medium word-aligned case  */
 #define	MED_MAX		511	/* max copy for medium longword-aligned case  */
 #define	ST_CHUNK	20	/* ST_CHUNK - block of values for BIS Store  */
 /* on T4, prefetch 20 is a strong read prefetch to L1 and L2 data cache
  * prefetch 20 can cause inst pipeline to delay if data is in memory
  * prefetch 21 is a strong read prefetch to L2 data cache, not L1 data cache  */
 #define	ALIGN_PRE	20	/* distance for aligned prefetch loop  */

 /* EX_ST and EX_RETVAL expand to their argument unchanged in this file.  */
 #define EX_ST(x)	x
 #define EX_RETVAL(x)	x
 /* 8-byte store through the "most recently used" block-init ASI.  */
 #define STORE_ASI(src,addr)	stxa src, [addr] ASI_STBIMRU_P
 /* 8-byte store through the "least recently used" block-init ASI.  */
 #define STORE_INIT(src,addr)	stxa src, [addr] ASI_STBI_P

 #if IS_IN (libc)

 	.text

 /* __memmove_niagara7: copy len bytes from src to dst; the buffers may
    overlap.
    In:   %o0 = dst, %o1 = src, %o2 = len.
    Out:  %o0 = dst.
    When src >= dst, or len <= dst - src, the copy is done forward
    (.Lforcpy).  Otherwise it is done backward from the end of both
    buffers.  In the backward path %o5 is the descending source pointer
    and %o4 = dst - src, so [%o5+%o4] addresses the matching destination
    location; once %o5 has walked back down to src, %o5 + %o4 is dst.  */
 ENTRY(__memmove_niagara7)
 	/* %o0=dst, %o1=src, %o2=len */
 	cmp	%o1, %o0	/* if from address is >= to use forward copy  */
 	bgeu,pn	%XCC, .Lforcpy	/* else use backward if ...  */
 	 sub	%o0, %o1, %o4	/* get difference of two addresses  */
 	cmp	%o2, %o4	/* compare size and difference of addresses  */
 	bleu,pn	%XCC, .Lforcpy	/* size <= difference: forward copy is safe  */
 	 add	%o1, %o2, %o5	/* get to end of source space  */

 /* an overlapped copy that must be done "backwards"  */
 .Lchksize:
 	cmp	%o2, 8			/* less than 8 byte do byte copy  */
 	blu,pn %XCC, 2f			/* else continue  */

 /* Now size is at least 8.  The add below sits in the delay slot of the
    branch above, so it is also executed when the branch is taken.  */
 .Ldbalign:
 	 add	%o0, %o2, %g1		/* get to end of dest space  */
 	andcc	%g1, 7, %o3		/* %o3 has cnt til dst 8 byte align  */
 	bz,a,pn	%XCC, .Ldbbck		/* skip if dst is 8 byte aligned  */
 	 andn	%o2, 7, %o3		/* force %o3 cnt to multiple of 8  */
 	sub	%o2, %o3, %o2		/* update o2 with new count  */

 	/* Byte loop: copy %o3 bytes so the end of dst becomes 8-byte
 	   aligned.  */
 1:	dec	%o5			/* decrement source  */
 	ldub	[%o5], %g1		/* load one byte  */
 	deccc	%o3			/* decrement count  */
 	bgu,pt	%XCC, 1b		/* if not done keep copying  */
 	 stb	%g1, [%o5+%o4]		/* store one byte into dest  */
 	andncc	%o2, 7, %o3		/* force %o3 cnt to multiple of 8  */
 	bz,pn	%XCC, 2f		/* if size < 8, move to byte copy  */

 /* Now Destination is 8 byte aligned.  The andcc at .Ldbbck is also the
    delay slot of the bz above.  */
 .Ldbbck:
 	 andcc	%o5, 7, %o0		/* %o0 has src offset  */
 	bz,a,pn	%XCC, .Ldbcopybc	/* if src is aligned do fast memmove  */
 	 sub	%o2, %o3, %o2		/* Residue bytes in %o2  */

 /* Source is not 8-byte aligned: read aligned 8-byte words and merge
    neighbouring words with shifts.  %g1 = 8 * (src & 7) is the left
    shift and %g5 = 64 - %g1 the right shift.  %o2 is biased by -8 so
    the loop stops one 8-byte word early; the remainder is handled by
    the byte loop at 2:.  */
 .Lcpy_dbwdbc:				/* alignment of src is needed  */
 	sub	%o2, 8, %o2		/* set size one loop ahead  */
 	sll	%o0, 3, %g1		/* %g1 is left shift  */
 	mov	64, %g5			/* init %g5 to be 64  */
 	sub	%g5, %g1, %g5		/* %g5 rightshift = (64 - leftshift)  */
 	sub	%o5, %o0, %o5		/* align the src at 8 bytes.  */
 	add	%o4, %o0, %o4		/* increase diff between src & dst  */
 	ldx	[%o5], %o1		/* load first 8 bytes  */
 	srlx	%o1, %g5, %o1
 1:	sub	%o5, 8, %o5		/* subtract 8 from src  */
 	ldx	[%o5], %o0		/* load 8 byte  */
 	sllx	%o0, %g1, %o3		/* shift loaded val left to tmp reg  */
 	or	%o1, %o3, %o3		/* align data  */
 	stx	%o3, [%o5+%o4]		/* store 8 byte  */
 	subcc	%o2, 8, %o2		/* subtract 8 byte from size  */
 	bg,pt	%XCC, 1b		/* if size > 0 continue  */
 	 srlx	%o0, %g5, %o1		/* move extra byte for the next use  */

 	srl	%g1, 3, %o0		/* restore %o0 value for alignment  */
 	add	%o5, %o0, %o5		/* restore src alignment  */
 	sub	%o4, %o0, %o4		/* restore diff between src & dest  */

 	ba	2f			/* branch to the trailing byte copy  */
 	 add	%o2, 8, %o2		/* restore size value  */

 /* Source and destination are both 8-byte aligned: copy %o3 bytes
    (a multiple of 8) one 8-byte word at a time, moving downward.  */
 .Ldbcopybc:				/* alignment of src is not needed  */
 1:	sub	%o5, 8, %o5		/* subtract from src  */
 	ldx	[%o5], %g1		/* load 8 bytes  */
 	subcc	%o3, 8, %o3		/* subtract from size  */
 	bgu,pt	%XCC, 1b		/* if size is bigger 0 continue  */
 	 stx	%g1, [%o5+%o4]		/* store 8 bytes to destination  */

 	ba	2f
 	 nop

 /* Trailing byte loop, entered at 2: with %o2 = bytes still to copy.
    The branch is annulling, so the dec of %o5 in its delay slot is
    executed only when the branch back to 1: is taken.  */
 .Lbcbyte:
 1:	ldub	[%o5], %g1		/* load one byte  */
 	stb	%g1, [%o5+%o4]		/* store one byte  */
 2:	deccc	%o2			/* decrement size  */
 	bgeu,a,pt %XCC, 1b		/* if size is >= 0 continue  */
 	 dec	%o5			/* decrement from address  */

 .Lexitbc:				/* exit from backward copy  */
 	retl
 	 add	%o5, %o4, %o0		/* restore dest addr  */


 /* Check to see if memmove is large aligned copy
  * If so, use special version of copy that avoids
  * use of block store init.
  *
  * Forward-copy half of __memmove_niagara7.  %g1 receives the original
  * dst, which is the value the shared memcpy exits return.  Small
  * copies, medium aligned copies and copies with an unaligned source
  * are finished by code inside __memcpy_niagara7 (.Lmv_short, .Lmedlong,
  * .Lsrc_dst_unaligned_on_8).
  *
  * NOTE(review): the size test below uses blt (signed), whereas the
  * matching test in __memcpy_niagara7 uses bgeu (unsigned) -- confirm
  * that the difference is intended.  */
 .Lforcpy:
 	cmp	%o2, SMALL_MAX		/* check for not small case  */
 	blt,pn	%XCC, .Lmv_short	/* merge with memcpy  */
 	 mov	%o0, %g1		/* save %o0  */
 	neg	%o0, %o5
 	andcc	%o5, 7, %o5		/* bytes till DST 8 byte aligned  */
 	brz,pt	%o5, .Lmv_dst_aligned_on_8

 /* %o5 has the bytes to be written in partial store.
    The sub of %o5 from %o2 is the delay slot of the brz above; it is a
    no-op when the branch is taken because %o5 is zero then.  */
 	 sub	%o2, %o5, %o2
 	sub	%o1, %o0, %o1		/* %o1 gets the difference  */
 7:					/* dst aligning loop  */
 	ldub	[%o1+%o0], %o4		/* load one byte  */
 	subcc	%o5, 1, %o5
 	stb	%o4, [%o0]
 	bgu,pt	%XCC, 7b
 	 add	%o0, 1, %o0		/* advance dst  */
 	add	%o1, %o0, %o1		/* restore %o1  */
 .Lmv_dst_aligned_on_8:
 	andcc	%o1, 7, %o5
 	brnz,pn	%o5, .Lsrc_dst_unaligned_on_8
 	 prefetch [%o1 + (1 * BLOCK_SIZE)], 20

 .Lmv_src_dst_aligned_on_8:
 /* check if we are copying more than MED_MAX bytes  */
 	cmp	%o2, MED_MAX		/* limit to store buffer size  */
 	bleu,pt	%XCC, .Lmedlong
 	 prefetch [%o1 + (2 * BLOCK_SIZE)], 20

 /* The mv_align loop below mimics the memcpy code for large aligned copies,
  * but does not use the ASI_STBI_P (block initializing store) performance
  * optimization.  This is used when memcpy is incorrectly invoked with
  * overlapping buffers.  */

 .Lmv_large_align8_copy:			/* Src and dst share 8 byte align  */
 					/* align dst to 64 byte boundary  */
 	andcc	%o0, 0x3f, %o3		/* check for dst 64 byte aligned  */
 	brz,pn	%o3, .Lmv_aligned_on_64
 	 sub	%o3, 64, %o3		/* %o3 has negative bytes to move  */
 	add	%o2, %o3, %o2		/* adjust remaining count  */
 	/* Copy 8 bytes at a time until %o3 counts up to zero, i.e. until
 	   dst is 64-byte aligned.  */
 .Lmv_align_to_64:
 	ldx	[%o1], %o4
 	add	%o1, 8, %o1		/* increment src ptr  */
 	addcc	%o3, 8, %o3
 	stx	%o4, [%o0]
 	brnz,pt	%o3, .Lmv_align_to_64
 	 add	%o0, 8, %o0		/* increment dst ptr  */

 /* Also entered from __memcpy_niagara7 (.Laligned_to_64) when that code
    finds the buffers overlapping.  */
 .Lmv_aligned_on_64:
 	andn	%o2, 0x3f, %o5		/* %o5 is multiple of block size  */
 	and	%o2, 0x3f, %o2		/* residue bytes in %o2  */
 	/* Copy one 64-byte block per iteration with ordinary 8-byte loads
 	   and stores; %o5 counts the bytes left in whole blocks.  */
 .Lmv_align_loop:
 	ldx	[%o1],%o4
 	stx	%o4,[%o0]
 	prefetch [%o0 + (10 * BLOCK_SIZE)], 22
 	prefetch [%o1 + (10 * BLOCK_SIZE)], 21
 	subcc	%o5, 64, %o5
 	ldx	[%o1+8],%o4
 	stx	%o4,[%o0+8]
 	ldx	[%o1+16],%o4
 	stx	%o4,[%o0+16]
 	ldx	[%o1+24],%o4
 	stx	%o4,[%o0+24]
 	ldx	[%o1+32],%o4
 	stx	%o4,[%o0+32]
 	ldx	[%o1+40],%o4
 	stx	%o4,[%o0+40]
 	ldx	[%o1+48],%o4
 	add	%o1, 64, %o1
 	stx	%o4,[%o0+48]
 	add	%o0, 64, %o0
 	ldx	[%o1-8],%o4
 	bgt,pt	%XCC, .Lmv_align_loop
 	 stx	%o4,[%o0-8]

 	/* Fewer than 64 bytes remain in %o2; finish in the memcpy tail.  */
 	ba	.Lmedlong
 	 nop
 END(__memmove_niagara7)

 /* __mempcpy_niagara7: perform the same copy as __memcpy_niagara7 but
    return dst + len.  The return value is prepared in %g1 (in the delay
    slot of the branch) and control joins __memcpy_niagara7 at local
    label 101, which lies after memcpy's own "mov %o0, %g1".  */
 ENTRY(__mempcpy_niagara7)
 	/* %o0=dst, %o1=src, %o2=len */
 	ba,pt	%icc, 101f
 	 add	%o0, %o2, %g1		/* save dst + len  */
 END(__mempcpy_niagara7)

 	.align	32
 /* __memcpy_niagara7: copy len bytes from src to dst.
    In:   %o0 = dst, %o1 = src, %o2 = len.
    Out:  %o0 = dst, kept in %g1 for the duration of the copy.
    Every exit path returns %g1, which is how __mempcpy_niagara7 (entering
    at 101 with %g1 = dst + len) shares this body.
    This first part dispatches on size and alignment and handles copies
    shorter than SMALL_MAX whose src and dst are both 8-byte aligned.  */
 ENTRY(__memcpy_niagara7)
 100:	/* %o0=dst, %o1=src, %o2=len */
 	mov	%o0, %g1		/* save %o0  */
 101:
 #ifndef __arch64__
 	srl	%o2, 0, %o2
 #endif
 	cmp	%o2, SMALL_MAX		/* check for not small case  */
 	bgeu,pn	%XCC, .Lmedium		/* go to larger cases  */
 /* .Lmv_short labels the delay-slot instruction of the branch above;
    the forward path of __memmove_niagara7 enters here.  */
 .Lmv_short:
 	 cmp	%o2, SHORTCOPY		/* check for really short case  */
 	ble,pn	%XCC, .Lsmallfin
 	 or	%o0, %o1, %o4		/* prepare alignment check  */
 	andcc	%o4, 0x3, %o5		/* test for word alignment  */
 	bnz,pn	%XCC, .Lsmallunalign	/* branch to non-word aligned case  */
 	 nop
 	subcc	%o2, 7, %o2		/* adjust count  */
 	ble,pn	%XCC, .Lsmallwordx
 	 andcc	%o4, 0x7, %o5		/* test for long alignment  */
 /* 8 or more bytes, src and dest start on word boundary
  * %o4 contains or %o0, %o1
  * %o2 holds count - 7 from here on  */
 .Lsmalllong:
 	bnz,pn	%XCC, .Lsmallwords	/* branch to word aligned case  */
 	 cmp	%o2, SHORT_LONG-7
 	bge,a	%XCC, .Lmedl64		/* 64 or more bytes: use 64-byte loop  */
 	 sub	%o2,56,%o2		/* adjust %o2 to -63 off count  */

 /* slightly unroll the small_long_loop to improve very short copies  */
 	cmp	%o2, 32-7
 	blt,a,pn %XCC, .Lsmall_long_l
 	 sub	%o1, %o0, %o1		/* %o1 gets the difference  */

 	ldx	[%o1], %o5
 	ldx	[%o1+8], %o4
 	ldx	[%o1+16], %o3

 	subcc	%o2, 24, %o2
 	sub	%o1, %o0, %o1		/* %o1 gets the difference  */

 	stx	%o5, [%o0]		/* write 8 bytes  */
 	stx	%o4, [%o0+8]		/* write 8 bytes  */
 	stx	%o3, [%o0+16]		/* write 8 bytes  */

 	add	%o0, 24, %o0

 /* end loop unroll  */

 /* 8 bytes per iteration; %o1 holds src - dst so only %o0 advances.  */
 .Lsmall_long_l:
 	ldx	[%o1+%o0], %o3
 	subcc	%o2, 8, %o2
 	add	%o0, 8, %o0
 	bgu,pn	%XCC, .Lsmall_long_l	/* loop until done  */
 	 stx	%o3, [%o0-8]		/* write 8 bytes  */
 	addcc	%o2, 7, %o2		/* restore %o2 to correct count  */
 	bnz,pn	%XCC, .Lsmall_long_x	/* check for completion  */
 	 add	%o1, %o0, %o1		/* restore %o1  */
 	retl
 	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */
 /* 1 to 7 bytes remain in %o2.  */
 .Lsmall_long_x:
 	cmp	%o2, 4			/* check for 4 or more bytes left  */
 	blt,pn	%XCC, .Lsmallleft3	/* if not, go to finish up  */
 	 nop
 	lduw	[%o1], %o3
 	add	%o1, 4, %o1
 	subcc	%o2, 4, %o2
 	stw	%o3, [%o0]
 	bnz,pn	%XCC, .Lsmallleft3
 	 add	%o0, 4, %o0
 	retl
 	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */

 	.align 32
 /* src and dest start on word boundary; 7 or fewer bytes.
    On entry %o2 = count - 7.  The dispatch code only branches here for
    counts above SHORTCOPY, so one full word is always copied first and
    then up to three single bytes.  */
 .Lsmallwordx:
 	lduw	[%o1], %o3		/* read word  */
 	addcc	%o2, 3, %o2		/* restore count  */
 	bz,pt	%XCC, .Lsmallexit
 	 stw	%o3, [%o0]		/* write word  */
 	deccc	%o2			/* reduce count for cc test  */
 	ldub	[%o1+4], %o3		/* load one byte  */
 	bz,pt	%XCC, .Lsmallexit
 	 stb	%o3, [%o0+4]		/* store one byte  */
 	ldub	[%o1+5], %o3		/* load second byte  */
 	deccc	%o2
 	bz,pt	%XCC, .Lsmallexit
 	 stb	%o3, [%o0+5]		/* store second byte  */
 	ldub	[%o1+6], %o3		/* load third byte  */
 	stb	%o3, [%o0+6]		/* store third byte  */
 /* Common exit: return the value saved in %g1.  */
 .Lsmallexit:
 	retl
 	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */

 	.align 32
 /* Small copy where src and dst are not both word aligned.
    Counts of SHORTCHECK or fewer go to the byte loop (.Lsmallrest);
    counts of SMALL_UMAX or more go to .Lmedium_join.  Otherwise src is
    first brought to a word boundary with byte stores, and then one of
    four word loops is chosen from dst & 3: .Lw4cp when dst is word
    aligned too, else .Lw3cp / .Lw1cp / .Lw2cp, which read whole words
    from src and merge neighbouring words with shifts so that the stores
    to dst are word aligned.  */
 .Lsmallunalign:
 	cmp	%o2, SHORTCHECK
 	ble,pn	%XCC, .Lsmallrest
 	 cmp	%o2, SMALL_UMAX
 	bge,pt	%XCC, .Lmedium_join
 	 andcc	%o1, 0x3, %o5		/* is src word aligned  */
 	bz,pn	%XCC, .Laldst
 	 cmp	%o5, 2			/* is src half-word aligned  */
 	be,pt	%XCC, .Ls2algn
 	 cmp	%o5, 3			/* src is byte aligned  */
 /* src & 3 is 1 or 3.  Copy one byte; if src & 3 was 1 (the bne below
    uses the flags of the "cmp %o5, 3" above) two more bytes are copied
    at .Ls2algn.  */
 .Ls1algn:
 	ldub	[%o1], %o3		/* move 1 or 3 bytes to align it  */
 	inc	1, %o1
 	stb	%o3, [%o0]		/* move a byte to align src  */
 	inc	1, %o0
 	bne,pt	%XCC, .Ls2algn
 	 dec	%o2
 	b	.Lald			/* now go align dest  */
 	 andcc	%o0, 0x3, %o5

 .Ls2algn:
 	lduh	[%o1], %o3		/* know src is 2 byte aligned  */
 	inc	2, %o1
 	srl	%o3, 8, %o4
 	stb	%o4, [%o0]		/* have to do bytes,  */
 	stb	%o3, [%o0 + 1]		/* do not know dst alignment  */
 	inc	2, %o0
 	dec	2, %o2

 /* src is word aligned from here on; dispatch on dst & 3.  */
 .Laldst:
 	andcc	%o0, 0x3, %o5		/* align the destination address  */
 .Lald:
 	bz,pn	%XCC, .Lw4cp
 	 cmp	%o5, 2
 	be,pn	%XCC, .Lw2cp
 	 cmp	%o5, 3
 /* dst & 3 is 1 or 3.  Store the top byte of the first source word; if
    dst & 3 was 1 (flags still from "cmp %o5, 3") continue at .Lw1cp.  */
 .Lw3cp:	lduw	[%o1], %o4
 	inc	4, %o1
 	srl	%o4, 24, %o5
 	stb	%o5, [%o0]
 	bne,pt	%XCC, .Lw1cp
 	 inc	%o0
 	dec	1, %o2
 	andn	%o2, 3, %o3		/* %o3 is aligned word count  */
 	dec	4, %o3			/* avoid reading beyond tail of src  */
 	sub	%o1, %o0, %o1		/*  %o1 gets the difference  */

 	/* Each stored word is the low 3 bytes of the previous source word
 	   followed by the top byte of the next one.  */
 1:	sll	%o4, 8, %g5		/* save residual bytes  */
 	lduw	[%o1+%o0], %o4
 	deccc	4, %o3
 	srl	%o4, 24, %o5		/* merge with residual  */
 	or	%o5, %g5, %g5
 	st	%g5, [%o0]
 	bnz,pt	%XCC, 1b
 	 inc	4, %o0
 	sub	%o1, 3, %o1		/* used one byte of last word read  */
 	and	%o2, 3, %o2
 	b	7f
 	 inc	4, %o2

 /* dst & 3 was 1: one byte has been stored already; store the next two
    bytes of the source word as a halfword, leaving one residual byte.  */
 .Lw1cp:	srl	%o4, 8, %o5
 	sth	%o5, [%o0]
 	inc	2, %o0
 	dec	3, %o2
 	andn	%o2, 3, %o3		/* %o3 is aligned word count  */
 	dec	4, %o3			/* avoid reading beyond tail of src  */
 	sub	%o1, %o0, %o1		/* %o1 gets the difference  */

 2:	sll	%o4, 24, %g5		/* save residual bytes  */
 	lduw	[%o1+%o0], %o4
 	deccc	4, %o3
 	srl	%o4, 8, %o5		/* merge with residual  */
 	or	%o5, %g5, %g5
 	st	%g5, [%o0]
 	bnz,pt	%XCC, 2b
 	 inc	4, %o0
 	sub	%o1, 1, %o1		/* used 3 bytes of last word read  */
 	and	%o2, 3, %o2
 	b	7f
 	 inc	4, %o2

 /* dst & 3 is 2: store the upper halfword of the first source word,
    leaving two residual bytes.  */
 .Lw2cp:	lduw	[%o1], %o4
 	inc	4, %o1
 	srl	%o4, 16, %o5
 	sth	%o5, [%o0]
 	inc	2, %o0
 	dec	2, %o2
 	andn	%o2, 3, %o3		/* %o3 is aligned word count  */
 	dec	4, %o3			/* avoid reading beyond tail of src  */
 	sub	%o1, %o0, %o1		/* %o1 gets the difference  */

 3:	sll	%o4, 16, %g5		/* save residual bytes  */
 	lduw	[%o1+%o0], %o4
 	deccc	4, %o3
 	srl	%o4, 16, %o5		/* merge with residual  */
 	or	%o5, %g5, %g5
 	st	%g5, [%o0]
 	bnz,pt	%XCC, 3b
 	 inc	4, %o0
 	sub	%o1, 2, %o1		/* used two bytes of last word read  */
 	and	%o2, 3, %o2
 	b	7f
 	 inc	4, %o2

 /* src and dst are both word aligned: plain word copy.  */
 .Lw4cp:	andn	%o2, 3, %o3		/* %o3 is aligned word count  */
 	sub	%o1, %o0, %o1		/* %o1 gets the difference  */

 1:	lduw	[%o1+%o0], %o4		/* read from address  */
 	deccc	4, %o3			/* decrement count  */
 	st	%o4, [%o0]		/* write at destination address  */
 	bgu,pt	%XCC, 1b
 	 inc	4, %o0			/* increment to address  */
 	and	%o2, 3, %o2		/* number of leftover bytes, if any  */

 	/* simple finish up byte copy, works with any alignment  */
 7:
 	add	%o1, %o0, %o1		/* restore %o1  */
 /* Byte tail: %o2 = bytes left.  Copies 4 bytes per iteration while at
    least 4 remain, then 0 to 3 single bytes.  */
 .Lsmallrest:
 	tst	%o2
 	bz,pt	%XCC, .Lsmallx
 	 cmp	%o2, 4
 	blt,pn	%XCC, .Lsmallleft3
 	 nop
 	sub	%o2, 3, %o2
 .Lsmallnotalign4:
 	ldub	[%o1], %o3		/* read byte  */
 	subcc	%o2, 4, %o2		/* reduce count by 4  */
 	stb	%o3, [%o0]		/* write byte  */
 	ldub	[%o1+1], %o3		/* repeat for total of 4 bytes  */
 	add	%o1, 4, %o1		/* advance SRC by 4  */
 	stb	%o3, [%o0+1]
 	ldub	[%o1-2], %o3
 	add	%o0, 4, %o0		/* advance DST by 4  */
 	stb	%o3, [%o0-2]
 	ldub	[%o1-1], %o3
 	bgu,pt	%XCC, .Lsmallnotalign4	/* loop til 3 or fewer bytes remain  */
 	 stb	%o3, [%o0-1]
 	addcc	%o2, 3, %o2		/* restore count  */
 	bz,pt	%XCC, .Lsmallx
 /* The subcc at .Lsmallleft3 is also the delay slot of the bz above.  */
 .Lsmallleft3:				/* 1, 2, or 3 bytes remain  */
 	 subcc	%o2, 1, %o2
 	ldub	[%o1], %o3		/* load one byte  */
 	bz,pt	%XCC, .Lsmallx
 	 stb	%o3, [%o0]		/* store one byte  */
 	ldub	[%o1+1], %o3		/* load second byte  */
 	subcc	%o2, 1, %o2
 	bz,pt	%XCC, .Lsmallx
 	 stb	%o3, [%o0+1]		/* store second byte  */
 	ldub	[%o1+2], %o3		/* load third byte  */
 	stb	%o3, [%o0+2]		/* store third byte  */
 .Lsmallx:
 	retl
 	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */

 /* Count is SHORTCOPY or less: return at once when it is zero, else
    copy the bytes at .Lsmallleft3.  */
 .Lsmallfin:
 	tst	%o2
 	bnz,pn	%XCC, .Lsmallleft3
 	 nop
 	retl
 	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */

 	.align 16
 /* Small copy where src and dst are both word aligned but not both
    8-byte aligned.  On entry %o2 = count - 7 and is greater than zero.
    Copies two words (8 bytes) per iteration, then an optional single
    word, then up to three bytes at .Lsmallleft3.  */
 .Lsmallwords:
 	lduw	[%o1], %o3		/* read word  */
 	subcc	%o2, 8, %o2		/* update count  */
 	stw	%o3, [%o0]		/* write word  */
 	add	%o1, 8, %o1		/* update SRC  */
 	lduw	[%o1-4], %o3		/* read word  */
 	add	%o0, 8, %o0		/* update DST  */
 	bgu,pt	%XCC, .Lsmallwords	/* loop until done  */
 	 stw	%o3, [%o0-4]		/* write word  */
 	addcc	%o2, 7, %o2		/* restore count  */
 	bz,pt	%XCC, .Lsmallexit	/* check for completion  */
 	 cmp	%o2, 4			/* check for 4 or more bytes left  */
 	blt,pt	%XCC, .Lsmallleft3	/* if not, go to finish up  */
 	 nop
 	lduw	[%o1], %o3
 	add	%o1, 4, %o1
 	subcc	%o2, 4, %o2
 	add	%o0, 4, %o0
 	bnz,pn	%XCC, .Lsmallleft3
 	 stw	%o3, [%o0-4]
 	retl
 	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */

 	.align 16
 /* Medium and large copies.  First bring dst to an 8-byte boundary with
    a byte loop, then branch on the alignment of src:
      src not 8-byte aligned          -> .Lsrc_dst_unaligned_on_8
      aligned, more than MED_MAX left -> .Llarge_align8_copy
      aligned, MED_MAX or fewer left  -> .Lmedlong (below).  */
 .Lmedium:
 .Lmedium_join:
 	neg	%o0, %o5
 	andcc	%o5, 7, %o5		/* bytes till DST 8 byte aligned  */
 	brz,pt	%o5, .Ldst_aligned_on_8

 	/* %o5 has the bytes to be written in partial store.
 	   The sub below is the delay slot of the brz above; it is a no-op
 	   when the branch is taken because %o5 is zero then.  */
 	 sub	%o2, %o5, %o2
 	sub	%o1, %o0, %o1		/* %o1 gets the difference  */
 7:					/* dst aligning loop  */
 	ldub	[%o1+%o0], %o4		/* load one byte  */
 	subcc	%o5, 1, %o5
 	stb	%o4, [%o0]
 	bgu,pt	%XCC, 7b
 	 add	%o0, 1, %o0		/* advance dst  */
 	add	%o1, %o0, %o1		/* restore %o1  */
 .Ldst_aligned_on_8:
 	andcc	%o1, 7, %o5
 	brnz,pt	%o5, .Lsrc_dst_unaligned_on_8
 	 nop

 .Lsrc_dst_aligned_on_8:
 	/* check if we are copying more than MED_MAX bytes  */
 	cmp	%o2, MED_MAX		/* limit to store buffer size  */
 	bgu,pn	%XCC, .Llarge_align8_copy
 	 nop
 /*
  * Special case for handling when src and dest are both long word aligned
  * and total data to move is at most MED_MAX bytes.
  * Also used as the tail of the large aligned copies, and entered
  * directly at .Lmedl64 / .Lmedl63 with %o2 already holding count - 63.
  */
 .Lmedlong:
 	subcc	%o2, 63, %o2		/* adjust length to allow cc test  */
 	ble,pn	%XCC, .Lmedl63		/* skip big loop if < 64 bytes  */
 	 nop
 .Lmedl64:
 	ldx	[%o1], %o4		/* load  */
 	subcc	%o2, 64, %o2		/* decrement length count  */
 	stx	%o4, [%o0]		/* and store  */
 	ldx	[%o1+8], %o3		/* a block of 64 bytes  */
 	stx	%o3, [%o0+8]
 	ldx	[%o1+16], %o4
 	stx	%o4, [%o0+16]
 	ldx	[%o1+24], %o3
 	stx	%o3, [%o0+24]
 	ldx	[%o1+32], %o4		/* load  */
 	stx	%o4, [%o0+32]		/* and store  */
 	ldx	[%o1+40], %o3		/* a block of 64 bytes  */
 	add	%o1, 64, %o1		/* increase src ptr by 64  */
 	stx	%o3, [%o0+40]
 	ldx	[%o1-16], %o4
 	add	%o0, 64, %o0		/* increase dst ptr by 64  */
 	stx	%o4, [%o0-16]
 	ldx	[%o1-8], %o3
 	bgu,pt	%XCC, .Lmedl64		/* repeat if at least 64 bytes left  */
 	 stx	%o3, [%o0-8]
 /* %o2 = remaining count - 63 here.  */
 .Lmedl63:
 	addcc	%o2, 32, %o2		/* adjust remaining count  */
 	ble,pt	%XCC, .Lmedl31		/* to skip if 31 or fewer bytes left  */
 	 nop
 	ldx	[%o1], %o4		/* load  */
 	sub	%o2, 32, %o2		/* decrement length count  */
 	stx	%o4, [%o0]		/* and store  */
 	ldx	[%o1+8], %o3		/* a block of 32 bytes  */
 	add	%o1, 32, %o1		/* increase src ptr by 32  */
 	stx	%o3, [%o0+8]
 	ldx	[%o1-16], %o4
 	add	%o0, 32, %o0		/* increase dst ptr by 32  */
 	stx	%o4, [%o0-16]
 	ldx	[%o1-8], %o3
 	stx	%o3, [%o0-8]
 /* %o2 = remaining count - 31 here.  */
 .Lmedl31:
 	addcc	%o2, 16, %o2		/* adjust remaining count  */
 	ble,pt	%XCC, .Lmedl15		/* skip if 15 or fewer bytes left  */
 	 nop
 	ldx	[%o1], %o4		/* load and store 16 bytes  */
 	add	%o1, 16, %o1		/* increase src ptr by 16  */
 	stx	%o4, [%o0]
 	sub	%o2, 16, %o2		/* decrease count by 16  */
 	ldx	[%o1-8], %o3
 	add	%o0, 16, %o0		/* increase dst ptr by 16  */
 	stx	%o3, [%o0-8]
 /* %o2 = remaining count - 15 here.  */
 .Lmedl15:
 	addcc	%o2, 15, %o2		/* restore count  */
 	bz,pt	%XCC, .Lsmallexit	/* exit if finished  */
 	 cmp	%o2, 8
 	blt,pt	%XCC, .Lmedw7		/* skip if 7 or fewer bytes left  */
 	 tst	%o2
 	ldx	[%o1], %o4		/* load 8 bytes  */
 	add	%o1, 8, %o1		/* increase src ptr by 8  */
 	add	%o0, 8, %o0		/* increase dst ptr by 8  */
 	subcc	%o2, 8, %o2		/* decrease count by 8  */
 	bnz,pn	%XCC, .Lmedw7
 	 stx	%o4, [%o0-8]		/* and store 8 bytes  */
 	retl
 	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */

 	.align 16
 /* dst is 8-byte aligned, src is not.  When src is at least word
    aligned and fewer than MED_WMAX bytes remain, each 8-byte store is
    assembled from two word loads (first word shifted into the upper
    half).  Every other case goes to the floating-point path at
    .Lunalignsetup / .Lunalignrejoin.  */
 .Lsrc_dst_unaligned_on_8:
 	/* DST is 8-byte aligned, src is not  */
 	andcc	%o1, 0x3, %o5		/* test word alignment  */
 	bnz,pt	%XCC, .Lunalignsetup	/* branch if not word aligned  */
 	 nop

 /*
  * Handle all cases where src and dest are aligned on word
  * boundaries. Use unrolled loops for better performance.
  * This option wins over standard large data move when
  * source and destination is in cache for medium
  * to short data moves.
  */
 	cmp %o2, MED_WMAX		/* limit to store buffer size  */
 	bge,pt	%XCC, .Lunalignrejoin	/* otherwise rejoin main loop  */
 	 nop

 	subcc	%o2, 31, %o2		/* adjust length to allow cc test  */
 					/* for end of loop  */
 	ble,pt	%XCC, .Lmedw31		/* skip big loop if less than 32 bytes  */
 /* The first ld of .Lmedw32 is the delay slot of the ble above, so it is
    also executed when the loop is skipped; its result is then unused.  */
 .Lmedw32:
 	 ld	[%o1], %o4		/* move a block of 32 bytes  */
 	sllx	%o4, 32, %o5
 	ld	[%o1+4], %o4
 	or	%o4, %o5, %o5
 	stx	%o5, [%o0]
 	subcc	%o2, 32, %o2		/* decrement length count  */
 	ld	[%o1+8], %o4
 	sllx	%o4, 32, %o5
 	ld	[%o1+12], %o4
 	or	%o4, %o5, %o5
 	stx	%o5, [%o0+8]
 	add	%o1, 32, %o1		/* increase src ptr by 32  */
 	ld	[%o1-16], %o4
 	sllx	%o4, 32, %o5
 	ld	[%o1-12], %o4
 	or	%o4, %o5, %o5
 	stx	%o5, [%o0+16]
 	add	%o0, 32, %o0		/* increase dst ptr by 32  */
 	ld	[%o1-8], %o4
 	sllx	%o4, 32, %o5
 	ld	[%o1-4], %o4
 	or	%o4, %o5, %o5
 	bgu,pt	%XCC, .Lmedw32		/* repeat if at least 32 bytes left  */
 	 stx	%o5, [%o0-8]
 .Lmedw31:
 	addcc	%o2, 31, %o2		/* restore count  */
 	bz,pt	%XCC, .Lsmallexit	/* exit if finished  */
 	 cmp	%o2, 16
 	blt,pt	%XCC, .Lmedw15
 	 nop
 	ld	[%o1], %o4		/* move a block of 16 bytes  */
 	sllx	%o4, 32, %o5
 	subcc	%o2, 16, %o2		/* decrement length count  */
 	ld	[%o1+4], %o4
 	or	%o4, %o5, %o5
 	stx	%o5, [%o0]
 	add	%o1, 16, %o1		/* increase src ptr by 16  */
 	ld	[%o1-8], %o4
 	add	%o0, 16, %o0		/* increase dst ptr by 16  */
 	sllx	%o4, 32, %o5
 	ld	[%o1-4], %o4
 	or	%o4, %o5, %o5
 	stx	%o5, [%o0-8]
 /* The bz below tests flags set earlier: by the "subcc %o2, 16" when
    falling through from the 16-byte block, or by the "cmp %o2, 16" when
    branched to from .Lmedw31.  */
 .Lmedw15:
 	bz,pt	%XCC, .Lsmallexit	/* exit if finished  */
 	 cmp	%o2, 8
 	blt,pn	%XCC, .Lmedw7		/* skip if 7 or fewer bytes left  */
 	 tst	%o2
 	ld	[%o1], %o4		/* load 4 bytes  */
 	subcc	%o2, 8, %o2		/* decrease count by 8  */
 	stw	%o4, [%o0]		/* and store 4 bytes  */
 	add	%o1, 8, %o1		/* increase src ptr by 8  */
 	ld	[%o1-4], %o3		/* load 4 bytes  */
 	add	%o0, 8, %o0		/* increase dst ptr by 8  */
 	stw	%o3, [%o0-4]		/* and store 4 bytes  */
 	bz,pt	%XCC, .Lsmallexit	/* exit if finished  */
 /* The cmp at .Lmedw7 is also the delay slot of the bz above.  */
 .Lmedw7:				/* count is ge 1, less than 8  */
 	 cmp	%o2, 4			/* check for 4 bytes left  */
 	blt,pn	%XCC, .Lsmallleft3	/* skip if 3 or fewer bytes left  */
 	 nop
 	ld	[%o1], %o4		/* load 4 bytes  */
 	add	%o1, 4, %o1		/* increase src ptr by 4  */
 	add	%o0, 4, %o0		/* increase dst ptr by 4  */
 	subcc	%o2, 4, %o2		/* decrease count by 4  */
 	bnz,pt	%XCC, .Lsmallleft3
 	 stw	%o4, [%o0-4]		/* and store 4 bytes  */
 	retl
 	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */

 	.align 16
 /* Large copy with src and dst both 8-byte aligned and more than MED_MAX
    bytes left.  dst is first advanced to a 64-byte boundary by copying
    8, 16 and/or 32 bytes as selected by bits 3, 4 and 5 of dst.  The
    buffers are then tested for overlap: overlapping copies are finished
    by .Lmv_aligned_on_64 (ordinary stores), all others by the
    block-init-store loops below.  */
 .Llarge_align8_copy:			/* Src and dst 8 byte aligned  */
 	/* align dst to 64 byte boundary  */
 	andcc	%o0, 0x3f, %o3		/* check for dst 64 byte aligned  */
 	brz,pn	%o3, .Laligned_to_64
 	 andcc	%o0, 8, %o3		/* odd long words to move?  */
 	brz,pt	%o3, .Laligned_to_16
 	 nop
 	ldx	[%o1], %o4
 	sub	%o2, 8, %o2
 	add	%o1, 8, %o1		/* increment src ptr  */
 	add	%o0, 8, %o0		/* increment dst ptr  */
 	stx	%o4, [%o0-8]
 .Laligned_to_16:
 	andcc	%o0, 16, %o3		/* pair of long words to move?  */
 	brz,pt	%o3, .Laligned_to_32
 	 nop
 	ldx	[%o1], %o4
 	sub	%o2, 16, %o2
 	stx	%o4, [%o0]
 	add	%o1, 16, %o1		/* increment src ptr  */
 	ldx	[%o1-8], %o4
 	add	%o0, 16, %o0		/* increment dst ptr  */
 	stx	%o4, [%o0-8]
 .Laligned_to_32:
 	andcc	%o0, 32, %o3		/* four long words to move?  */
 	brz,pt	%o3, .Laligned_to_64
 	 nop
 	ldx	[%o1], %o4
 	sub	%o2, 32, %o2
 	stx	%o4, [%o0]
 	ldx	[%o1+8], %o4
 	stx	%o4, [%o0+8]
 	ldx	[%o1+16], %o4
 	stx	%o4, [%o0+16]
 	add	%o1, 32, %o1		/* increment src ptr  */
 	ldx	[%o1-8], %o4
 	add	%o0, 32, %o0		/* increment dst ptr  */
 	stx	%o4, [%o0-8]
 .Laligned_to_64:
 /*	Following test is included to avoid issues where existing executables
  *	incorrectly call memcpy with overlapping src and dest instead of memmove
  *
  *	if ( (src ge dst) and (dst+len > src)) go to overlap case
  *	if ( (src lt dst) and (src+len > dst)) go to overlap case
  *
  *	NOTE(review): the address comparisons below use the signed
  *	conditions bge/bgt -- confirm this is intended for pointers.
  */
 	cmp	%o1,%o0
 	bge,pt	%XCC, 1f
 	 nop
 /*				src+len > dst?  */
 	add	%o1, %o2, %o4
 	cmp	%o4, %o0
 	bgt,pt	%XCC, .Lmv_aligned_on_64
 	 nop
 	ba	2f
 	 nop
 1:
 /*				dst+len > src?  */
 	add	%o0, %o2, %o4
 	cmp	%o4, %o1
 	bgt,pt	%XCC, .Lmv_aligned_on_64
 	 nop
 2:
 /*	handle non-overlapped copies
  *
  *	Using block init store (BIS) instructions to avoid fetching cache
  *	lines from memory. Use ST_CHUNK stores to first element of each cache
  *	line (similar to prefetching) to avoid overfilling STQ or miss buffers.
  *	Gives existing cache lines time to be moved out of L1/L2/L3 cache.
  */
 	andn	%o2, 0x3f, %o5		/* %o5 is multiple of block size  */
 	and	%o2, 0x3f, %o2		/* residue bytes in %o2  */

 /*	We use ASI_STBIMRU_P for the first store to each cache line
  *	followed by ASI_STBI_P (mark as LRU) for the last store. That
  *	mixed approach reduces the chances the cache line is removed
  *	before we finish setting it, while minimizing the effects on
  *	other cached values during a large memcpy
  *
  *	Intermediate stores can be normal since first BIS activates the
  *	cache line in the L2 cache.
  *
  *	ST_CHUNK batches up initial BIS operations for several cache lines
  *	to allow multiple requests to not be blocked by overflowing the
  *	the store miss buffer. Then the matching stores for all those
  *	BIS operations are executed.
  */

 .Lalign_loop:
 	cmp	%o5, ST_CHUNK*64
 	blu,pt	%XCC, .Lalign_short
 	 mov	ST_CHUNK, %o3
 	sllx	%o3, 6, %g5		/* ST_CHUNK*64  */

 /* Pass 1 of a chunk: store the first 8 bytes of each of ST_CHUNK
    cache lines with the MRU block-init ASI, two lines per iteration.  */
 .Lalign_loop_start:
 	prefetch [%o1 + (ALIGN_PRE * BLOCK_SIZE)], 21
 	subcc	%o3, 2, %o3
 	ldx	[%o1], %o4
 	add	%o1, 128, %o1
 	EX_ST(STORE_ASI(%o4, %o0))
 	add	%o0, 64, %o0
 	ldx	[%o1-64], %o4
 	EX_ST(STORE_ASI(%o4, %o0))
 	add	%o0, 64, %o0
 	bgu,pt	%XCC, .Lalign_loop_start
 	 prefetch [%o1 + ((ALIGN_PRE-1) * BLOCK_SIZE)], 21

 	mov	ST_CHUNK, %o3
 	sub	%o1, %g5, %o1		/* reset %o1  */
 	sub	%o0, %g5, %o0		/* reset %o0  */

 	sub	%o0, 8, %o0		/* adjust %o0 for ASI alignment  */
 /* Pass 2 of a chunk: fill bytes 8..55 of each line with ordinary
    stores, then store bytes 56..63 with the LRU block-init ASI.  With
    %o0 biased by -8, that final store addresses [%o0] directly.  */
 .Lalign_loop_rest:
 	ldx	[%o1+8],%o4
 	add	%o0, 64, %o0
 	stx	%o4, [%o0-48]
 	subcc	%o3, 1, %o3
 	ldx	[%o1+16],%o4
 	stx	%o4, [%o0-40]
 	sub	%o5, 64, %o5
 	ldx	[%o1+24],%o4
 	stx	%o4, [%o0-32]
 	ldx	[%o1+32],%o4
 	stx	%o4, [%o0-24]
 	ldx	[%o1+40],%o4
 	stx	%o4, [%o0-16]
 	ldx	[%o1+48],%o4
 	stx	%o4, [%o0-8]
 	add	%o1, 64, %o1
 	ldx	[%o1-8],%o4
 	bgu,pt	%XCC, .Lalign_loop_rest
 	 EX_ST(STORE_INIT(%o4,%o0))	/* mark cache line as LRU  */

 	mov	ST_CHUNK, %o3
 	cmp	%o5, ST_CHUNK*64
 	bgu,pt	%XCC, .Lalign_loop_start
 	 add	%o0, 8, %o0		/* restore %o0 from ASI alignment  */

 	cmp	%o5, 0
 	beq,pt	%XCC, .Lalign_done

 /* no prefetches needed in these loops
  * since we are within ALIGN_PRE of the end.
  * The srl at .Lalign_short is also the delay slot of the beq above;
  * it sets %o3 to the number of whole cache lines left.  */
 .Lalign_short:
 	 srl	%o5, 6, %o3
 .Lalign_loop_short:
 	subcc	%o3, 1, %o3
 	ldx	[%o1], %o4
 	add	%o1, 64, %o1
 	EX_ST(STORE_ASI(%o4, %o0))
 	bgu,pt	%XCC, .Lalign_loop_short
 	 add	%o0, 64, %o0

 	sub	%o1, %o5, %o1		/* reset %o1  */
 	sub	%o0, %o5, %o0		/* reset %o0  */

 	sub	%o0, 8, %o0		/* adjust %o0 for ASI alignment  */
 /* Second pass over the final partial chunk, same layout as
    .Lalign_loop_rest but counted by %o5 (bytes).  */
 .Lalign_short_rest:
 	ldx	[%o1+8],%o4
 	add	%o0, 64, %o0
 	stx	%o4, [%o0-48]
 	ldx	[%o1+16],%o4
 	subcc	%o5, 64, %o5
 	stx	%o4, [%o0-40]
 	ldx	[%o1+24],%o4
 	stx	%o4, [%o0-32]
 	ldx	[%o1+32],%o4
 	stx	%o4, [%o0-24]
 	ldx	[%o1+40],%o4
 	stx	%o4, [%o0-16]
 	ldx	[%o1+48],%o4
 	stx	%o4, [%o0-8]
 	add	%o1, 64, %o1
 	ldx	[%o1-8],%o4
 	bgu,pt	%XCC, .Lalign_short_rest
 	 EX_ST(STORE_INIT(%o4,%o0))	/* mark cache line as LRU  */

 	add	%o0, 8, %o0		/* restore %o0 from ASI alignment  */

 /* All whole cache lines are written; %o2 holds the 0..63 residue.
    The subcc in the delay slot biases %o2 as .Lmedl63 expects and is
    harmless when the function returns instead.  */
 .Lalign_done:
 	cmp	%o2, 0
 	membar	#StoreStore
 	bne,pt	%XCC, .Lmedl63
 	 subcc	%o2, 63, %o2		/* adjust length to allow cc test  */
 	retl
 	 mov	EX_RETVAL(%g1), %o0	/* restore %o0  */

 	.align 16
 	/* Dst is on 8 byte boundary; src is not.  The remaining count is
 	 * large: at least SMALL_MAX less the dst-alignment bytes on the
 	 * main path (entries via .Lmedium_join can arrive with as few as
 	 * SMALL_UMAX bytes before dst alignment).  */
 	/* Since block load/store and BIS are not in use for unaligned data,
 	 * no need to align dst on 64 byte cache line boundary  */
 	/* The copy reads 8-byte aligned doublewords from src into the
 	 * floating-point registers and uses alignaddr/faligndata to
 	 * assemble each aligned 8-byte store.  %g5 keeps the original FEF
 	 * bit of %fprs so the state can be restored at .Lunalign_short.  */
 .Lunalignsetup:
 .Lunalignrejoin:
 	rd	%fprs, %g5		/* check for unused fp  */
 	/* if fprs.fef == 0, set it.
 	 * Setting it when already set costs more than checking */
 	andcc	%g5, FPRS_FEF, %g5	/* test FEF, fprs.du = fprs.dl = 0  */
 	bz,a	%XCC, 1f
 	 wr	%g0, FPRS_FEF, %fprs	/* fprs.fef = 1  */
 1:
 	andn	%o2, 0x3f, %o5		/* %o5 is multiple of block size  */
 	and	%o2, 0x3f, %o2		/* residue bytes in %o2  */
 	cmp	%o2, 8			/* Ensure we do not load beyond  */
 	bgt,pt	%XCC, .Lunalign_adjust	/* end of source buffer  */
 	 andn	%o1, 0x7, %o4		/* %o4 has 8 byte aligned src addr  */
 	add	%o2, 64, %o2		/* adjust to leave loop  */
 	sub	%o5, 64, %o5		/* early if necessary  */
 .Lunalign_adjust:
 	alignaddr %o1, %g0, %g0		/* generate %gsr  */
 	add	%o1, %o5, %o1		/* advance %o1 to after blocks  */
 	ldd	[%o4], %f0
 	/* 64 bytes per iteration.  Each pass loads eight more aligned
 	 * doublewords, the last of them ([%o4] after the add) being kept
 	 * in %f0 as the first input of the next pass.  */
 .Lunalign_loop:
 	prefetch [%o0 + (9 * BLOCK_SIZE)], 20
 	ldd	[%o4+8], %f2
 	faligndata %f0, %f2, %f16
 	ldd	[%o4+16], %f4
 	subcc	%o5, BLOCK_SIZE, %o5
 	std	%f16, [%o0]
 	faligndata %f2, %f4, %f18
 	ldd	[%o4+24], %f6
 	std	%f18, [%o0+8]
 	faligndata %f4, %f6, %f20
 	ldd	[%o4+32], %f8
 	std	%f20, [%o0+16]
 	faligndata %f6, %f8, %f22
 	ldd	[%o4+40], %f10
 	std	%f22, [%o0+24]
 	faligndata %f8, %f10, %f24
 	ldd	[%o4+48], %f12
 	std	%f24, [%o0+32]
 	faligndata %f10, %f12, %f26
 	ldd	[%o4+56], %f14
 	add	%o4, BLOCK_SIZE, %o4
 	std	%f26, [%o0+40]
 	faligndata %f12, %f14, %f28
 	ldd	[%o4], %f0
 	std	%f28, [%o0+48]
 	faligndata %f14, %f0, %f30
 	std	%f30, [%o0+56]
 	add	%o0, BLOCK_SIZE, %o0
 	bgu,pt	%XCC, .Lunalign_loop
 	 prefetch [%o4 + (11 * BLOCK_SIZE)], 20

 	/* Handle trailing bytes: after the setup above %o2 holds 9 to 72
 	 * Dest long word aligned, Src not long word aligned  */
 	cmp	%o2, 15
 	bleu,pt	%XCC, .Lunalign_short

 	 andn	%o2, 0x7, %o5		/* %o5 is multiple of 8  */
 	and	%o2, 0x7, %o2		/* residue bytes in %o2  */
 	add	%o2, 8, %o2
 	sub	%o5, 8, %o5		/* do not load past end of src  */
 	andn	%o1, 0x7, %o4		/* %o4 has 8 byte aligned src addr  */
 	add	%o1, %o5, %o1		/* move %o1 to after multiple of 8  */
 	ldd	[%o4], %f0		/* fetch partial word  */
 	/* 8 bytes per iteration; fsrc2 moves the newest doubleword into
 	 * %f0 for the next pass.  */
 .Lunalign_by8:
 	ldd	[%o4+8], %f2
 	add	%o4, 8, %o4
 	faligndata %f0, %f2, %f16
 	subcc	%o5, 8, %o5
 	std	%f16, [%o0]
 	fsrc2	%f2, %f0
 	bgu,pt	%XCC, .Lunalign_by8
 	 add	%o0, 8, %o0

 	/* %g5 is nonzero when FEF was already set on entry, in which case
 	 * %fprs is left alone.  Otherwise %g5 is zero and is written back
 	 * to %fprs in the delay slot of the ba.  Either way the last
 	 * bytes are copied by .Lsmallrest.  */
 .Lunalign_short:			/* restore fprs state */
 	brnz,pt	%g5, .Lsmallrest
 	 nop
 	ba	.Lsmallrest
 	 wr	%g5, %g0, %fprs
 END(__memcpy_niagara7)

 #endif