/* $NetBSD: bcopy.S,v 1.4 2024/02/07 04:20:25 msaitoh Exp $ */

/*
 * Copyright (c) 2018 Ryo Shimizu
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

#if defined(LIBC_SCCS)
RCSID("$NetBSD: bcopy.S,v 1.4 2024/02/07 04:20:25 msaitoh Exp $")
#endif

#if defined(MEMCOPY)

/*
 * void *memcpy(void * restrict dst, const void * restrict src, size_t len);
 */
#define FUNCTION		memcpy
#define NO_OVERLAP
#define SRC0			x1
#define DST0			x0
#define LEN			x2

#elif defined(MEMMOVE)

/*
 * void *memmove(void *dst, const void *src, size_t len);
 */
#define FUNCTION		memmove
#undef NO_OVERLAP
#define SRC0			x1
#define DST0			x0
#define LEN			x2

#else /* !MEMCOPY && !MEMMOVE */

/*
 * void bcopy(const void *src, void *dst, size_t len);
 */
#define FUNCTION		bcopy
#define NO_OVERLAP
#define SRC0			x0
#define DST0			x1
#define LEN			x2

#endif /* MEMCOPY/MEMMOVE/BCOPY */

/* caller-saved temporary registers. breakable. */
#define TMP_X			x3
#define TMP_Xw			w3
#define TMP_D			x4
#define TMP_S			x5
#define DST			x6
#define SRC			x7
#define DATA0			x8
#define DATA0w			w8
#define DATA1			x9
#define DATA1w			w9
#define DATA2			x10
#define SRC_ALIGNBIT		x11	/* (SRC & 7) * 8 */
#define DST_ALIGNBIT		x12	/* (DST & 7) * 8 */
#define SRC_DST_ALIGNBIT	x13	/* = SRC_ALIGNBIT - DST_ALIGNBIT */
#define DST_SRC_ALIGNBIT	x14	/* = -SRC_DST_ALIGNBIT */

#define STP_ALIGN		16	/* align before stp/ldp. 8 or 16 */
#define SMALLSIZE		32
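
/*
 * Overview of the strategy used below (an illustrative summary, not the
 * exact code path; copyN() is informal shorthand for the unrolled ldp/stp
 * sequences):
 *
 *  - small (len < SMALLSIZE): move 16/8/4/2/1-byte pieces selected by the
 *    individual bits of len, with no loop.
 *  - large: align dst to STP_ALIGN, stream 16 bytes per ldp/stp in unrolled
 *    1KB blocks, then finish the tail by the bits of len (512, 256, ..., 1).
 *  - overlap (memmove only): if dst > src, do the same from src + len and
 *    dst + len downward so overlapping bytes are read before being written.
 *  - STRICT_ALIGNMENT with (src & 7) != (dst & 7): load aligned 64-bit
 *    words and recombine them with shifts before storing ("shifting copy").
 */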

	.text
	.align	5

#ifndef NO_OVERLAP
#ifndef STRICT_ALIGNMENT
backward_ignore_align:
	prfm	PLDL1KEEP, [SRC0]
	add	SRC0, SRC0, LEN
	add	DST, DST0, LEN
	cmp	LEN, #SMALLSIZE
	bcs	copy_backward
copy_backward_small:
	cmp	LEN, #8
	bcs	9f

	/* 0 <= len < 8 */
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
9:

	cmp	LEN, #16
	bcs	9f

	/* 8 <= len < 16 */
	/* *--(uint64_t *)dst = *--(uint64_t *)src; */
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
9:

	/* 16 <= len < 32 */
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
#endif /* !STRICT_ALIGNMENT */
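
/*
 * Note on the small/tail copies used throughout this file: they never loop.
 * Each moves one 16/8/4/2/1-byte piece per set bit of len, e.g. in C
 * (illustrative model only; the real code uses pre/post-indexed loads and
 * stores, and uint128_t stands for a 16-byte ldp/stp pair):
 *
 *	if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; }
 *	if (len & 8)  { *(uint64_t *)dst++ = *(uint64_t *)src++; }
 *	if (len & 4)  { *(uint32_t *)dst++ = *(uint32_t *)src++; }
 *	if (len & 2)  { *(uint16_t *)dst++ = *(uint16_t *)src++; }
 *	if (len & 1)  { *(uint8_t *)dst++ = *(uint8_t *)src++; }
 */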

	.align	4
copy_backward:
	/* DST may be unaligned at this point */
#ifndef STRICT_ALIGNMENT
	cmp	LEN, #512	/* pre-alignment can be overhead when small */
	bcc	9f
#endif
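	/*
	 * Pre-align DST down to STP_ALIGN by moving the odd 1/2/4(/8)
	 * trailing bytes first, so the bulk ldp/stp loops below store to
	 * aligned addresses.  When unaligned access is permitted this is
	 * skipped for copies shorter than 512 bytes (see above), on the
	 * assumption that the alignment work costs more than it saves.
	 */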
	/* if (DST & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	sub	LEN, LEN, #1
1:
	/* if (DST & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
	sub	LEN, LEN, #2
1:
	/* if (DST & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
	sub	LEN, LEN, #4
1:
#if (STP_ALIGN > 8)
	/* if (DST & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	DST, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
	sub	LEN, LEN, #8
1:
#endif /* (STP_ALIGN > 8) */
9:

backward_copy1k:
	/* while (len >= 1024) */
	/* { src -= 1024; dst -= 1024; copy1024(dst, src); len -= 1024; } */
	cmp	LEN, #1024
	blo	9f
1:
	sub	LEN, LEN, #1024
	.rept	(1024 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!	/* *--dst = *--src; */
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
	cmp	LEN, #1024
	bhs	1b
9:

	/* if (len & 512) { src -= 512; dst -= 512; copy512(dst, src); } */
	tbz	LEN, #9, 1f
	.rept	(512 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 256) { src -= 256; dst -= 256; copy256(dst, src); } */
	tbz	LEN, #8, 1f
	.rept	(256 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 128) { src -= 128; dst -= 128; copy128(dst, src); } */
	tbz	LEN, #7, 1f
	.rept	(128 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 64) { src -= 64; dst -= 64; copy64(dst, src); } */
	tbz	LEN, #6, 1f
	.rept	(64 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 32) { src -= 32; dst -= 32; copy32(dst, src); } */
	tbz	LEN, #5, 1f
	.rept	(32 / 16)
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
	.endr
1:
	/* if (len & 16) { *--(uint128_t *)dst = *--(uint128_t *)src; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
1:
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret
#endif /* !NO_OVERLAP */


#if defined(STRICT_ALIGNMENT) && !defined(NO_OVERLAP)
	.align	5
backward_copy:
	prfm	PLDL1KEEP, [SRC0]
	add	DST, DST0, LEN
	add	SRC0, SRC0, LEN
	cmp	LEN, #SMALLSIZE
	bcs	strict_backward

	cmp	LEN, #10
	bcs	9f
backward_tiny:
	/* copy 1-9 bytes */
1:	sub	LEN, LEN, #1
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	cbnz	LEN, 1b
	ret
9:
	/* length is small(<32), and src or dst may be unaligned */
	eor	TMP_X, SRC0, DST
	ands	TMP_X, TMP_X, #7
	bne	notaligned_backward_small

samealign_backward_small:
	/* if (dst & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
	sub	LEN, LEN, #1
1:
	/* if (dst & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
	sub	LEN, LEN, #2
1:
	/* if (dst & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
	sub	LEN, LEN, #4
1:
	/* if (len & 16) { *--(uint128_t *)dst = *--(uint128_t *)src; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0, #-16]!
	stp	DATA0, DATA1, [DST, #-16]!
1:
	/* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0, #-8]!
	str	TMP_X, [DST, #-8]!
1:
	/* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0, #-4]!
	str	TMP_Xw, [DST, #-4]!
1:
	/* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0, #-2]!
	strh	TMP_Xw, [DST, #-2]!
1:
	/* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!
1:
	ret

notaligned_backward_small:
	/* length is small, and src or dst may be unaligned */
	sub	TMP_S, SRC0, LEN	/* tmp_s = src - len */
1:					/* do { */
	ldrb	TMP_Xw, [SRC0, #-1]!
	strb	TMP_Xw, [DST, #-1]!	/*  *(char *)dst++ = *(char *)src++ */
	cmp	TMP_S, SRC0		/* while (tmp_s < src) */
	blo	1b
	ret

strict_backward:
	/* src or dst may be unaligned */
	and	SRC_ALIGNBIT, SRC0, #7
	and	DST_ALIGNBIT, DST, #7
	lsl	SRC_ALIGNBIT, SRC_ALIGNBIT, #3
	lsl	DST_ALIGNBIT, DST_ALIGNBIT, #3
	sub	SRC_DST_ALIGNBIT, SRC_ALIGNBIT, DST_ALIGNBIT
	cbz	SRC_DST_ALIGNBIT, copy_backward	/* same alignment? */

	and	SRC, SRC0, #~7
	and	DST, DST, #~7
	neg	DST_SRC_ALIGNBIT, SRC_DST_ALIGNBIT

#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	SRC_DST_ALIGNBIT, #63, 5f	/* if(SRC_DST_ALIGNBIT < 0) { */

	cmp	SRC, SRC0			/* don't access out of range */
	beq	1f
	ldr	DATA1, [SRC]
1:
	ldr	DATA0, [SRC, #-8]!

	lsl	DATA1, DATA1, DST_SRC_ALIGNBIT	/* data1 =                    */
	lsr	TMP_X, DATA0, SRC_DST_ALIGNBIT	/* (data1<<dst_src_alignbit)| */
	orr	DATA1, DATA1, TMP_X		/* (data0>>src_dst_alignbit); */

	b	9f				/* }                          */
5:						/* else {                     */
	ldr	DATA0, [SRC]			/*  data0 = *src;             */
	lsr	DATA1, DATA0, SRC_DST_ALIGNBIT	/*  data1=data0>>src_dst_abit;*/
9:						/* }                          */

	cbz	DST_ALIGNBIT, 9f	/* if (dst_alignbit != 0) {           */
	mov	TMP_D, DST		/*   tmp_d = dst;                     */

	tbz	DST_ALIGNBIT, #(2+3), 1f /*   if (dst_alignbit & (4<<3)) {    */
	str	DATA1w, [TMP_D], #4	/*      *(uint32_t *)tmp_d++ = data1; */
	lsr	DATA1, DATA1, #32	/*      data1 >>= 32;                 */
1:					/*    }                               */
	tbz	DST_ALIGNBIT, #(1+3), 1f /*   if (dst_alignbit & (2<<3)) {    */
	strh	DATA1w, [TMP_D], #2	/*      *(uint16_t *)tmp_d++ = data1; */
	lsr	DATA1, DATA1, #16	/*      data1 >>= 16;                 */
1:					/*    }                               */
	tbz	DST_ALIGNBIT, #(0+3), 1f /*   if (dst_alignbit & (1<<3)) {    */
	strb	DATA1w, [TMP_D]		/*      *(uint8_t *)tmp_d = data1;    */
1:					/*    }                               */

	sub	LEN, LEN, DST_ALIGNBIT, lsr #3	/* len -=(dst_alignbit>>3);   */
9:					/* }                                  */
#else /* BYTE_ORDER */
	tbz	SRC_DST_ALIGNBIT, #63, 5f	/* if(SRC_DST_ALIGNBIT < 0) { */

	cmp	SRC, SRC0			/* don't access out of range */
	beq	1f
	ldr	DATA1, [SRC]
1:
	ldr	DATA0, [SRC, #-8]!

	lsr	DATA1, DATA1, DST_SRC_ALIGNBIT	/* data1 =                    */
	lsl	TMP_X, DATA0, SRC_DST_ALIGNBIT	/* (data1>>dst_src_alignbit)| */
	orr	DATA1, DATA1, TMP_X		/* (data0<<src_dst_alignbit); */

	b	9f				/* }                          */
5:						/* else {                     */
	ldr	DATA0, [SRC]			/*  data0 = *src;             */
	lsl	DATA1, DATA0, SRC_DST_ALIGNBIT	/*  data1=data0<<src_dst_abit;*/
9:						/* }                          */

	cbz	DST_ALIGNBIT, 9f	/* if (dst_alignbit != 0) {           */
	mov	TMP_D, DST		/*   tmp_d = dst;                     */

	tbz	DST_ALIGNBIT, #(2+3), 1f /*   if (dst_alignbit & (4<<3)) {    */
	lsr	TMP_X, DATA1, #32	/*      x = data1 >> 32;              */
	str	TMP_Xw, [TMP_D], #4	/*      *(uint32_t *)tmp_d++ = x;     */
1:					/*    }                               */
	tbz	DST_ALIGNBIT, #(1+3), 1f /*   if (dst_alignbit & (2<<3)) {    */
	lsr	TMP_X, DATA1, #16	/*      x = data1 >> 16;              */
	strh	TMP_Xw, [TMP_D], #2	/*      *(uint16_t *)tmp_d++ = x;     */
1:					/*    }                               */
	tbz	DST_ALIGNBIT, #(0+3), 1f /*   if (dst_alignbit & (1<<3)) {    */
	lsr	TMP_X, DATA1, #8	/*      x = data1 >> 8;               */
	strb	TMP_Xw, [TMP_D], #1	/*      *(uint8_t *)tmp_d++ = x;      */
1:					/*    }                               */

	sub	LEN, LEN, DST_ALIGNBIT, lsr #3	/* len -=(dst_alignbit>>3);   */
9:					/* }                                  */
#endif /* BYTE_ORDER */
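
/*
 * Backward shifting-copy main loop.  SRC and DST are now 8-byte aligned but
 * the original pointers differed in (addr & 7), so every aligned destination
 * word is assembled from two consecutive aligned source words, roughly (for
 * little endian; big endian swaps the shift directions, and the variable
 * shifts are taken modulo 64 by the lsl/lsr-by-register instructions):
 *
 *	dstword = (lower_srcword >> src_dst_alignbit) |
 *	          (upper_srcword << dst_src_alignbit);
 *
 * DATA0 carries the word loaded in the previous iteration, so each 16-byte
 * step costs one ldp and one stp.
 */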


backward_shifting_copy_loop:
	ldp	DATA2, DATA1, [SRC, #-16]!
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data2 >> src_dst_alignbit) | (data1 << dst_src_alignbit); */
	lsl	DATA1, DATA1, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA2, SRC_DST_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data2 << src_dst_alignbit) | (data1 >> dst_src_alignbit); */
	lsr	DATA1, DATA1, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA2, SRC_DST_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#endif /* BYTE_ORDER */
	stp	DATA1, DATA0, [DST, #-16]!
	mov	DATA0, DATA2
	sub	LEN, LEN, #16
	cmp	LEN, #16
	bhs	backward_shifting_copy_loop


	/* write 8 bytes */
	tbz	LEN, #3, 9f

	ldr	DATA1, [SRC, #-8]!
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */
	str	DATA0, [DST, #-8]!
	mov	DATA0, DATA1
	sub	LEN, LEN, #8
9:

	cbz	LEN, backward_shifting_copy_done

	/* copy last 1-7 bytes */
	and	TMP_X, SRC_DST_ALIGNBIT, #63
	cmp	LEN, TMP_X, lsr #3
	bls	1f
	ldr	DATA1, [SRC, #-8]!	/* don't access out of range */
1:

#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
	lsl	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsr	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
	lsr	DATA0, DATA0, DST_SRC_ALIGNBIT
	lsl	TMP_X, DATA1, SRC_DST_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */

#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	LEN, #2, 1f
	ror	DATA0, DATA0, #32
	str	DATA0w, [DST, #-4]!
1:
	tbz	LEN, #1, 1f
	ror	DATA0, DATA0, #48
	strh	DATA0w, [DST, #-2]!
1:
	tbz	LEN, #0, 1f
	ror	DATA0, DATA0, #56
	strb	DATA0w, [DST, #-1]!
1:
#else /* BYTE_ORDER */
	tbz	LEN, #2, 1f
	str	DATA0w, [DST, #-4]!
	lsr	DATA0, DATA0, #32
1:
	tbz	LEN, #1, 1f
	strh	DATA0w, [DST, #-2]!
	lsr	DATA0, DATA0, #16
1:
	tbz	LEN, #0, 1f
	strb	DATA0w, [DST, #-1]!
1:
#endif /* BYTE_ORDER */
backward_shifting_copy_done:
	ret
#endif /* defined(STRICT_ALIGNMENT) && !defined(NO_OVERLAP) */
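
/*
 * Entry point.  The dispatch below behaves roughly like this C model
 * (memmove case; memcpy and bcopy are built with NO_OVERLAP and skip the
 * overlap test):
 *
 *	if (len == 0 || src == dst)
 *		return;
 *	if ((uintptr_t)src < (uintptr_t)dst)
 *		copy backward from src + len and dst + len, so an
 *		overlapping region is read before it is overwritten;
 *	else
 *		copy forward;
 */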


	.align	5
ENTRY(FUNCTION)
#ifdef STRICT_ALIGNMENT
	cbz	LEN, done
#ifndef NO_OVERLAP
	cmp	SRC0, DST0
	beq	done
	bcc	backward_copy
#endif /* NO_OVERLAP */
	mov	DST, DST0
	cmp	LEN, #SMALLSIZE
	bcs	strict_forward

	cmp	LEN, #10
	bcs	9f
forward_tiny:
	/* copy 1-9 bytes */
1:	sub	LEN, LEN, #1
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	cbnz	LEN, 1b
	ret
9:
	/* length is small(<32), and src or dst may be unaligned */
	eor	TMP_X, SRC0, DST0
	ands	TMP_X, TMP_X, #7
	bne	notaligned_forward_small
samealign_forward_small:
	/* if (dst & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	sub	LEN, LEN, #1
1:
	/* if (dst & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
	sub	LEN, LEN, #2
1:
	/* if (dst & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
	sub	LEN, LEN, #4
1:
	/* if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
1:
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret

notaligned_forward_small:
	/* src and dst are not aligned... */
	prfm	PLDL1KEEP, [SRC0]
	prfm	PLDL1KEEP, [SRC0, #8]
	prfm	PLDL1KEEP, [SRC0, #16]
	add	TMP_S, SRC0, LEN	/* tmp_s = src + len */
1:					/* do { */
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1	/*  *(char *)dst++ = *(char *)src++ */
	cmp	SRC0, TMP_S		/* while (src < tmp_s); */
	blo	1b
	ret

strict_forward:
	/* src or dst may be unaligned */
	and	SRC_ALIGNBIT, SRC0, #7
	and	DST_ALIGNBIT, DST0, #7
	lsl	SRC_ALIGNBIT, SRC_ALIGNBIT, #3
	lsl	DST_ALIGNBIT, DST_ALIGNBIT, #3
	sub	SRC_DST_ALIGNBIT, SRC_ALIGNBIT, DST_ALIGNBIT
	cbz	SRC_DST_ALIGNBIT, copy_forward	/* same alignment? */

	and	SRC, SRC0, #~7
	and	DST, DST0, #~7
	neg	DST_SRC_ALIGNBIT, SRC_DST_ALIGNBIT

#if BYTE_ORDER == LITTLE_ENDIAN
	tbz	DST_SRC_ALIGNBIT, #63, 5f	/* if(DST_SRC_ALIGNBIT < 0) { */
	ldp	DATA1, DATA0, [SRC], #16
	neg	TMP_X, SRC_ALIGNBIT
	lsr	DATA1, DATA1, SRC_ALIGNBIT	/* data1 =                    */
	lsl	TMP_X, DATA0, TMP_X		/*  (data1 >> src_alignbit) | */
	orr	DATA1, DATA1, TMP_X		/*  (data0 << -src_alignbit); */
	b	9f
5:
	ldr	DATA0, [SRC], #8
	lsr	DATA1, DATA0, SRC_ALIGNBIT
9:

	cbz	DST_ALIGNBIT, 5f
	mov	TMP_D, DST0
	/* if (tmp_d & 1) { *(uint8_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #0, 1f
	strb	DATA1w, [TMP_D], #1
	lsr	DATA1, DATA1, #8
1:
	/* if (tmp_d & 2) { *(uint16_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #1, 1f
	strh	DATA1w, [TMP_D], #2
	lsr	DATA1, DATA1, #16
1:
	/* if (tmp_d & 4) { *(uint32_t *)tmp_d++ = data1; } */
	tbz	TMP_D, #2, 1f
	str	DATA1w, [TMP_D], #4
1:
	add	DST, DST, #8
	b	9f
5:
	str	DATA1, [DST], #8
9:
	sub	LEN, LEN, #8
	add	LEN, LEN, DST_ALIGNBIT, lsr #3
#else /* BYTE_ORDER */
	tbz	DST_SRC_ALIGNBIT, #63, 5f	/* if(DST_SRC_ALIGNBIT < 0) { */
	ldp	DATA1, DATA0, [SRC], #16
	neg	TMP_X, SRC_ALIGNBIT
	lsl	DATA1, DATA1, SRC_ALIGNBIT	/* data1 =                    */
	lsr	TMP_X, DATA0, TMP_X		/*  (data1 << src_alignbit) | */
	orr	DATA1, DATA1, TMP_X		/*  (data0 >> -src_alignbit); */
	b	9f
5:
	ldr	DATA0, [SRC], #8
	lsl	DATA1, DATA0, SRC_ALIGNBIT
9:

	cbz	DST_ALIGNBIT, 5f
	mov	TMP_D, DST0
	/* if (tmp_d & 1) { *(uint8_t *)tmp_d++ = data1 >> 56; } */
	tbz	TMP_D, #0, 1f
	lsr	TMP_X, DATA1, #56
	strb	TMP_Xw, [TMP_D], #1
1:
	/* if (tmp_d & 2) { *(uint16_t *)tmp_d++ = data1 >> 48; } */
	tbz	TMP_D, #1, 1f
	lsr	TMP_X, DATA1, #48
	strh	TMP_Xw, [TMP_D], #2
1:
	/* if (tmp_d & 4) { *(uint32_t *)tmp_d++ = data1 >> 32; } */
	tbz	TMP_D, #2, 1f
	lsr	TMP_X, DATA1, #32
	str	TMP_Xw, [TMP_D], #4
1:
	add	DST, DST, #8
	b	9f
5:
	str	DATA1, [DST], #8
9:
	sub	LEN, LEN, #8
	add	LEN, LEN, DST_ALIGNBIT, lsr #3
#endif /* BYTE_ORDER */
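
/*
 * Forward shifting-copy main loop, the mirror of the backward case above:
 * each aligned destination word combines the current source word with the
 * next one (little endian shown; big endian swaps the shift directions):
 *
 *	dstword = (cur_srcword >> src_dst_alignbit) |
 *	          (next_srcword << dst_src_alignbit);
 *
 * DATA0 carries the last word of each ldp into the next iteration, so the
 * loop again costs one ldp + one stp per 16 bytes.
 */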

shifting_copy_loop:
	ldp	DATA1, DATA2, [SRC], #16
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data1 >> src_dst_alignbit) | (data2 << dst_src_alignbit) */
	lsr	DATA1, DATA1, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA2, DST_SRC_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
	/* data1 = (data1 << src_dst_alignbit) | (data2 >> dst_src_alignbit) */
	lsl	DATA1, DATA1, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA2, DST_SRC_ALIGNBIT
	orr	DATA1, DATA1, TMP_X
#endif /* BYTE_ORDER */
	stp	DATA0, DATA1, [DST], #16
	mov	DATA0, DATA2
	sub	LEN, LEN, #16
	cmp	LEN, #16
	bhs	shifting_copy_loop


	/* write 8 bytes */
	tbz	LEN, #3, 9f
	ldr	DATA1, [SRC], #8
#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */
	str	DATA0, [DST], #8
	mov	DATA0, DATA1
	sub	LEN, LEN, #8
9:

	cbz	LEN, shifting_copy_done

	/* copy last 1-7 bytes */
	and	TMP_X, DST_SRC_ALIGNBIT, #63
	cmp	LEN, TMP_X, lsr #3
	bls	1f
	ldr	DATA1, [SRC], #8	/* don't access out of range */
1:

#if BYTE_ORDER == LITTLE_ENDIAN
	/* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
	lsr	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsl	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
	/* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
	lsl	DATA0, DATA0, SRC_DST_ALIGNBIT
	lsr	TMP_X, DATA1, DST_SRC_ALIGNBIT
	orr	DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */

#if BYTE_ORDER == LITTLE_ENDIAN
	/* if (len & 4) { *(uint32_t *)dst++ = data0; } */
	tbz	LEN, #2, 1f
	str	DATA0w, [DST], #4
	lsr	DATA0, DATA0, #32
1:
	/* if (len & 2) { *(uint16_t *)dst++ = data0; } */
	tbz	LEN, #1, 1f
	strh	DATA0w, [DST], #2
	lsr	DATA0, DATA0, #16
1:
	/* if (len & 1) { *(uint8_t *)dst++ = data0; } */
	tbz	LEN, #0, 1f
	strb	DATA0w, [DST], #1
1:
#else /* BYTE_ORDER */
	/* if (len & 4) { *(uint32_t *)dst++ = data0 >> 32; } */
	tbz	LEN, #2, 1f
	lsr	TMP_X, DATA0, #32
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = data0 >> 16; } */
	tbz	LEN, #1, 1f
	lsr	TMP_X, DATA0, #16
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = data0 >> 8; } */
	tbz	LEN, #0, 1f
	lsr	TMP_X, DATA0, #8
	strb	TMP_Xw, [DST], #1
1:
#endif /* BYTE_ORDER */
shifting_copy_done:
	ret

#else /* STRICT_ALIGNMENT */
#ifndef NO_OVERLAP
	cbz	LEN, done
	cmp	SRC0, DST0
	beq	done
	bcc	backward_ignore_align
#endif /* NO_OVERLAP */

	prfm	PLDL1KEEP, [SRC0]
	cmp	LEN, #SMALLSIZE
	bcs	copy_forward
	mov	DST, DST0

copy_forward_small:
	cmp	LEN, #8
	bcs	9f

	/* 0 <= len < 8 */
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
9:

	prfm	PLDL1KEEP, [SRC0, #8]
	cmp	LEN, #16
	bcs	9f

	/* 8 <= len < 16 */
	/* *(uint64_t *)dst++ = *(uint64_t *)src++; */
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
9:

	/* 16 <= len < 32 */
	prfm	PLDL1KEEP, [SRC0, #16]
	prfm	PLDL1KEEP, [SRC0, #24]
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
#endif /* !STRICT_ALIGNMENT */

	.align	4
copy_forward:
	/* DST may be unaligned at this point */
	mov	DST, DST0
#ifndef STRICT_ALIGNMENT
	cmp	LEN, #512	/* pre-alignment can be overhead when small */
	bcc	9f
#endif /* STRICT_ALIGNMENT */
	/* if (DST & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	sub	LEN, LEN, #1
1:
	/* if (DST & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
	sub	LEN, LEN, #2
1:
	/* if (DST & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
	sub	LEN, LEN, #4
1:
#if (STP_ALIGN > 8)
	/* if (DST & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	DST, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
	sub	LEN, LEN, #8
1:
#endif /* (STP_ALIGN > 8) */
9:

forward_copy1k:
	/* while (len >= 1024) */
	/* { copy1024(dst, src); src += 1024; dst += 1024; len -= 1024; } */
	cmp	LEN, #1024
	blo	9f
1:
	sub	LEN, LEN, #1024
	.rept	(1024 / 16)
	ldp	DATA0, DATA1, [SRC0], #16	/* *dst++ = *src++; */
	stp	DATA0, DATA1, [DST], #16
	.endr
	cmp	LEN, #1024
	bhs	1b
9:

	/* if (len & 512) { copy512(dst, src); src += 512; dst += 512; } */
	tbz	LEN, #9, 1f
	.rept	(512 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 256) { copy256(dst, src); src += 256; dst += 256; } */
	tbz	LEN, #8, 1f
	.rept	(256 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 128) { copy128(dst, src); src += 128; dst += 128; } */
	tbz	LEN, #7, 1f
	.rept	(128 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 64) { copy64(dst, src); src += 64; dst += 64; } */
	tbz	LEN, #6, 1f
	.rept	(64 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 32) { copy32(dst, src); src += 32; dst += 32; } */
	tbz	LEN, #5, 1f
	.rept	(32 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
1:
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
done:
	ret
END(FUNCTION)