/*	$NetBSD: lock_stubs_ras.S,v 1.10 2019/04/06 03:06:26 thorpej Exp $	*/

/*-
 * Copyright (c) 2007 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "opt_cputype.h"
#include "opt_lockdebug.h"
#include "opt_multiprocessor.h"

#include <sys/errno.h>

#include <machine/asm.h>

RCSID("$NetBSD: lock_stubs_ras.S,v 1.10 2019/04/06 03:06:26 thorpej Exp $")

#include "assym.h"

/*
 * We rely on mips_vector_init to not select these routines when running
 * on a system with multiple CPUs.  We can still use them on a single-CPU
 * system in a MULTIPROCESSOR kernel, since they remain useful when
 * kernel preemption is in use.
 */

/*
 * Lock stubs for non-MP kernels.  These are implemented using restartable
 * sequences, since LL/SC are either not available (MIPS1 and a couple of
 * oddball MIPS3 CPUs) or not desirable (overhead).
 *
 * The order of the generated code is particularly important here.  Some
 * assumptions:
 *
 * o All of the critical sections are 20 bytes in size, and the second
 *   instruction in each critical section is aligned on a 16 byte boundary
 *   (see the top of _restart_lock_ras() for why).  The entry (the initial
 *   load) is the point where execution restarts if we trap within the
 *   section.
 *
 * o The entire code block is aligned on a 256 byte boundary, and is
 *   256 bytes in size.  This allows us to do a pessimistic check
 *   after taking a trap with:
 *
 *	if ((addr & ~255) == _lock_ras_start)
 *		addr = _restart_lock_ras(addr);
 *
 *   See definition of MIPS_LOCK_RAS_SIZE in asm.h.
 *
 * o In order to keep the size of the block down, the routines are run
 *   into each other.  Use objdump -d to check alignment after making
 *   changes.
 */
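
/*
 * Worked example (the offsets follow from the alignment asserts below):
 * in ras_atomic_cas_ulong the initial load sits at offset 12 within the
 * block and the store at offset 28.  A trap at the store restarts at
 * 28 - ((28 & 15) + 4) = 12, i.e. back at the load.
 */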
#ifndef __mips_o32
	.set	mips3
#else
	.set	mips1
#endif
	.set	noreorder
	.set	noat

/*
 * To work around branch prediction misbehavior on Loongson 2F
 * processors we need to clear the branch target buffer before a "j ra".
 * This requires extra instructions which don't fit in the RAS blocks,
 * so do a PC-relative jump to a block of code (this is the same size as
 * a "j ra") where we can let the assembler install the workaround.
 */
#ifdef MIPS3_LOONGSON2F
#define J_RA	j loongson_return
#else
#define J_RA	j ra
#endif


/*
 * unsigned long ras_atomic_cas_ulong(volatile unsigned long *val,
 *     unsigned long old, unsigned long new);
 */
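
/*
 * A non-atomic C sketch of the semantics (illustrative only; the
 * atomicity comes from the restartable sequence, not from the code):
 *
 *	ret = *val;
 *	if (ret == old)
 *		*val = new;
 *	return ret;
 */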
	.text
	.p2align LOG2_MIPS_LOCK_RAS_SIZE

EXPORT(_lock_ras_start)
STATIC_LEAF_NOPROFILE(ras_atomic_cas_noupdate)
	J_RA
	 move	v0, t0
END(ras_atomic_cas_noupdate)

	nop
	.if ((. - _lock_ras_start) & 15) != 12
	.error	"bad ras offset"
	.endif
STATIC_LEAF_NOPROFILE(ras_atomic_cas_ulong)
	PTR_L	t0, (a0)	/* <- critical section start */
_atomic_cas_ulong_ras_start:
	 nop
	bne	t0, a1, ras_atomic_cas_noupdate
 	 nop
	PTR_S	a2, (a0)	/* <- critical section end */
_atomic_cas_ulong_ras_end:
	J_RA
	 move	v0, a1
END(ras_atomic_cas_ulong)

/*
 * unsigned int ras_atomic_cas_uint(volatile unsigned int *val,
 *     unsigned int old, unsigned int new);
 */
	nop
	.if ((. - _lock_ras_start) & 15) != 12
	.error	"bad ras offset"
	.endif
STATIC_LEAF_NOPROFILE(ras_atomic_cas_uint)
	INT_L	t0, (a0)	/* <- critical section start */
_atomic_cas_uint_ras_start:
	 nop
	bne	t0, a1, ras_atomic_cas_noupdate
 	 nop
	INT_S	a2, (a0)	/* <- critical section end */
_atomic_cas_uint_ras_end:
	J_RA
	 move	v0, a1
END(ras_atomic_cas_uint)

/*
 * int _ucas_32_ras(volatile uint32_t *val, uint32_t old, uint32_t new,
 *     uint32_t *retp);
 */
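
/*
 * C sketch (illustrative): v1 holds the current lwp's PCB and v0 is
 * preset to 0 by the ras_ucas_32 wrapper below; a fault on the user
 * address is caught via pcb_onfault.
 *
 *	cur = *val;
 *	if (cur == old)
 *		*val = new;
 *	pcb->pcb_onfault = NULL;
 *	*retp = cur;
 *	return 0;
 */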
	nop
	.if ((. - _lock_ras_start) & 15) != 12
	.error	"bad ras offset"
	.endif
STATIC_LEAF_NOPROFILE(_ucas_32_ras)
	lw	t0, (a0)	/* <- critical section start */
_ucas_32_ras_start:
	 nop
	bne	t0, a1, _ucas_32_ras_end
 	 nop
	sw	a2, (a0)	/* <- critical section end */
_ucas_32_ras_end:
	PTR_S	zero, PCB_ONFAULT(v1)
	J_RA
	 sw	t0, 0(a3)
END(_ucas_32_ras)

#ifdef _LP64
/*
 * int _ucas_64_ras(volatile uint64_t *val, uint64_t old, uint64_t new,
 *     uint64_t *retp);
 */
	.if ((. - _lock_ras_start) & 15) != 12
	.error	"bad ras offset"
	.endif
STATIC_LEAF_NOPROFILE(_ucas_64_ras)
	ld	t0, (a0)	/* <- critical section start */
_ucas_64_ras_start:
	 nop
	bne	t0, a1, _ucas_64_ras_end
 	 nop
	sd	a2, (a0)	/* <- critical section end */
_ucas_64_ras_end:
	PTR_S	zero, PCB_ONFAULT(v1)
	J_RA
	 sd	t0, 0(a3)
END(_ucas_64_ras)
#endif /* _LP64 */

#ifndef LOCKDEBUG
/*
 * void ras_mutex_enter(kmutex_t *mtx);
 */
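
/*
 * C sketch of the adaptive-mutex fast path (illustrative; the owner
 * word is the first word of the kmutex_t):
 *
 *	if (mtx->mtx_owner != NULL)
 *		mutex_vector_enter(mtx);
 *	else
 *		mtx->mtx_owner = curlwp;
 */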
	.if ((. - _lock_ras_start) & 15) != 12
	.error	"bad ras offset"
	.endif
STATIC_LEAF_NOPROFILE(ras_mutex_enter)
	PTR_L	t0, (a0)	/* <- critical section start */
_mutex_enter_ras_start:
	 nop
	bnez	t0, ras_mutex_vector_enter
	 nop
	PTR_S	MIPS_CURLWP, (a0)/* <- critical section end */
_mutex_enter_ras_end:
	J_RA
	 nop
END(ras_mutex_enter)

/*
 * int ras_mutex_exit(kmutex_t *mtx);
 */
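
/*
 * C sketch (illustrative):
 *
 *	if (mtx->mtx_owner != curlwp)
 *		mutex_vector_exit(mtx);
 *	else
 *		mtx->mtx_owner = NULL;
 */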
	nop
	.if ((. - _lock_ras_start) & 15) != 12
	.error	"bad ras offset"
	.endif
STATIC_LEAF_NOPROFILE(ras_mutex_exit)
	PTR_L	t0, (a0)	/* <- critical section start */
_mutex_exit_ras_start:
	 nop
	bne	t0, MIPS_CURLWP, ras_mutex_vector_exit
	 nop
	PTR_S	zero, (a0)	/* <- critical section end */
_mutex_exit_ras_end:
	J_RA
	 nop
END(ras_mutex_exit)

/*
 * These could be moved out of the block to make room for more RAS
 * sequences.
 */
STATIC_LEAF_NOPROFILE(ras_mutex_vector_enter)
	j	_C_LABEL(mutex_vector_enter)
	 nop
END(ras_mutex_vector_enter)

STATIC_LEAF_NOPROFILE(ras_mutex_vector_exit)
	j	_C_LABEL(mutex_vector_exit)
	 nop
END(ras_mutex_vector_exit)
#endif	/* !LOCKDEBUG */

	.p2align LOG2_MIPS_LOCK_RAS_SIZE	/* Get out of the RAS block */

	.set at
#ifdef MIPS3_LOONGSON2F
loongson_return:
	j	ra
	 nop
#endif

/*
 * Patch up the given address.  We arrive here if we might have trapped
 * within one of the critical sections above.  Conceptually, for each
 * RAS entry point "ras":
 *
 *	if ((addr & ~15) == ras)
 *		return ras - 4;
 *	... check next ...
 *	return addr;
 *
 * The code below checks all of the entry points at once via a bitmask.
 *
 * Registers on entry:
 *
 *	k1	fault PC
 *	ra	return address
 *
 * On exit:
 *
 *	k1	adjusted fault PC
 *	ra	return address
 *	t0	clobbered
 *	t1	clobbered
 */

#define	RAS_MKMASK(a)	(1 << (((a)-_lock_ras_start) >> 4))

/*
 * Since each RAS is aligned on a 16 byte boundary, we can use its offset
 * from _lock_ras_start to construct a bitmask of the valid RAS entry
 * points within the block.
 */
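
/*
 * For example, given the alignment asserts above,
 * _atomic_cas_ulong_ras_start sits 16 bytes into the block, so
 * RAS_MKMASK(_atomic_cas_ulong_ras_start) == 1 << (16 >> 4) == 0x2.
 */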
#ifndef LOCKDEBUG
#define	MUTEX_RAS_MASK	(RAS_MKMASK(_mutex_enter_ras_start) \
			|RAS_MKMASK(_mutex_exit_ras_start))
#else
#define	MUTEX_RAS_MASK	0
#endif

#ifdef _LP64
#define	UCAS_64_MASK	RAS_MKMASK(_ucas_64_ras_start)
#else
#define	UCAS_64_MASK	0
#endif

#define	RAS_MASK	(RAS_MKMASK(_atomic_cas_ulong_ras_start) \
			|RAS_MKMASK(_atomic_cas_uint_ras_start) \
			|RAS_MKMASK(_ucas_32_ras_start) \
			|UCAS_64_MASK \
			|MUTEX_RAS_MASK)

/*
 * The caller has already determined that
 * _lock_ras_start == (k1 & -MIPS_LOCK_RAS_SIZE)
 */
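
/*
 * In C, the computation below is roughly (the subtraction lands on the
 * initial load, one instruction before the 16 byte boundary):
 *
 *	offset = k1 & (MIPS_LOCK_RAS_SIZE - 1);
 *	if (RAS_MASK & (1 << (offset >> 4)))
 *		k1 -= (offset & 15) + 4;
 *	return k1;
 */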
LEAF_NOPROFILE(_restart_lock_ras)
	and	t0, k1, MIPS_LOCK_RAS_SIZE - 1
				/* look at addr bits in ras region */
	srl	t0, 4		/* focus on each set of 16 bytes */
	li	t1, 1		/* need this to make a bitmask */
	sllv	t1, t1, t0	/* now we have a bitmask of the PC */
	andi	t1, RAS_MASK	/* was the PC in a RAS? */
	bnez	t1, 1f		/* yes, adjust PC */
	 and	t0, k1, 15	/* get offset in RAS */

	j	ra
	 nop
1:
	addu	t0, 4		/* bias offset by one more instruction */
	j	ra
	 PTR_SUBU k1, t0	/* and subtract that from the PC */
END(_restart_lock_ras)

/*
 * int ras_ucas_32(volatile uint32_t *ptr, uint32_t old, uint32_t new,
 *     uint32_t *retp);
 */
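
/*
 * Arm pcb_onfault so that a fault on the user address vectors to
 * ras_ucaserr, reject addresses with the sign bit set (kernel space),
 * then fall into the RAS stub with v0 preset to 0 (success).
 */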
STATIC_LEAF(ras_ucas_32)
	PTR_L	v1, L_PCB(MIPS_CURLWP)
	PTR_LA	v0, _C_LABEL(ras_ucaserr)
	PTR_S	v0, PCB_ONFAULT(v1)
	bltz	a0, _C_LABEL(ras_ucaserr)
	 nop
	b	_C_LABEL(_ucas_32_ras)
	 move	v0, zero			# assume success
END(ras_ucas_32)

#ifdef _LP64
/*
 * int ras_ucas_64(volatile uint64_t *ptr, uint64_t old, uint64_t new,
 *     uint64_t *retp);
 */
STATIC_LEAF(ras_ucas_64)
	PTR_L	v1, L_PCB(MIPS_CURLWP)
	PTR_LA	v0, _C_LABEL(ras_ucaserr)
	PTR_S	v0, PCB_ONFAULT(v1)
	bltz	a0, _C_LABEL(ras_ucaserr)
	 nop
	b	_C_LABEL(_ucas_64_ras)
	 move	v0, zero			# assume success
END(ras_ucas_64)
#endif /* _LP64 */

/*
 * Common fault handler for the ucas RAS stubs: clear pcb_onfault and
 * return EFAULT.
 */
STATIC_LEAF(ras_ucaserr)
	PTR_S	zero, PCB_ONFAULT(v1)		# reset fault handler
	j	ra
	 li	v0, EFAULT			# return EFAULT on error
END(ras_ucaserr)

#ifndef LOCKDEBUG
/*
 * void	ras_mutex_spin_enter(kmutex_t *mtx);
 */
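
/*
 * C sketch of the uniprocessor spin-mutex fast path below (illustrative
 * field names; the count runs downward from zero):
 *
 *	s = splraise(mtx->mtx_ipl);
 *	if (ci->ci_mtx_count-- == 0)
 *		ci->ci_mtx_oldspl = s;
 *
 * Under DIAGNOSTIC, the lock word is then taken if it is free,
 * otherwise we tail-call mutex_spin_retry().
 */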
STATIC_NESTED(ras_mutex_spin_enter, CALLFRAME_SIZ, ra)
	move	t0, a0
	PTR_L	t2, L_CPU(MIPS_CURLWP)
	INT_L	a0, MTX_IPL(t0)
#ifdef PARANOIA
	INT_L	ta1, CPU_INFO_CPL(t2)		# get current cpl
#endif

	/*
	 * We need to raise our IPL.
	 * call splraise (only uses a0-a3, v0-v1, and ra)
	 */
	move	t3, ra
	jal	_C_LABEL(splraise)
	 nop
	move	ra, t3

	/*
	 * If this is the first lock of the mutex, store the previous IPL
	 * for exit.
	 */
1:
	INT_L	ta2, CPU_INFO_MTX_COUNT(t2)
	nop
	INT_ADDU ta3, ta2, -1
	INT_S	ta3, CPU_INFO_MTX_COUNT(t2)

	bnez	ta2, 2f
	 nop
	INT_S	v0, CPU_INFO_MTX_OLDSPL(t2)	/* returned by splraise */
2:
#if defined(DIAGNOSTIC)
	INT_L	t3, MTX_LOCK(t0)
	li	t1, 1
	bnez	t3, 3f
	 nop
	j	ra
	 INT_S	t1, MTX_LOCK(t0)
3:
	j	_C_LABEL(mutex_spin_retry)
	 nop
#else	/* DIAGNOSTIC */
	j	ra
	 nop
#endif	/* DIAGNOSTIC */
END(ras_mutex_spin_enter)

/*
 * void	ras_mutex_spin_exit(kmutex_t *mtx);
 */
 */
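
/*
 * C sketch of the exit path below (illustrative field names):
 *
 *	oldspl = ci->ci_mtx_oldspl;
 *	ci->ci_mtx_count++;
 *	if (oldspl != ci->ci_cpl && ci->ci_mtx_count >= 0)
 *		splx(oldspl);
 */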
LEAF(ras_mutex_spin_exit)
	PTR_L	t2, L_CPU(MIPS_CURLWP)
	nop
#if defined(DIAGNOSTIC)
	INT_L	t0, MTX_LOCK(a0)
	nop
	beqz	t0, 2f
	 nop
	INT_S	zero, MTX_LOCK(a0)
#endif

	/*
	 * We need to grab this before the mutex count is incremented
	 * because if we get an interrupt, it may see the count as zero
	 * and overwrite the oldspl value with a bogus value.
	 */
#ifdef PARANOIA
	INT_L	a2, MTX_IPL(a0)
#endif
	INT_L	a0, CPU_INFO_MTX_OLDSPL(t2)

	/*
	 * Increment the mutex count
	 */
	INT_L	t0, CPU_INFO_MTX_COUNT(t2)
	nop
	INT_ADDU t0, t0, 1
	INT_S	t0, CPU_INFO_MTX_COUNT(t2)

	/*
	 * If the IPL doesn't change, nothing to do
	 */
	INT_L	a1, CPU_INFO_CPL(t2)
	nop

#ifdef PARANOIA
	sltu	v0, a1, a2		# v0 = cpl < mtx_ipl
	sltu	v1, a1, a0		# v1 = cpl < oldspl
	sll	v0, 1
	or	v0, v1
12:	bnez	v0, 12b			# loop forever if either is true
	 nop
#endif /* PARANOIA */

	beq	a0, a1, 1f		# if oldspl == cpl
	 nop				#   no reason to drop ipl

	bltz	t0, 1f			# there are still holders
	 nop				# so don't drop IPL

	/*
	 * Mutex count is zero so we need to restore the old IPL
	 */
#ifdef PARANOIA
	sltiu	v0, a0, IPL_HIGH+1
13:	beqz	v0, 13b			# loop forever if ipl > IPL_HIGH
	 nop
#endif
	j	 _C_LABEL(splx)
	 nop
1:
	j	ra
	 nop
#if defined(DIAGNOSTIC)
2:
	j	_C_LABEL(mutex_vector_exit)
	 nop
#endif
END(ras_mutex_spin_exit)
#endif	/* !LOCKDEBUG */

	.data
EXPORT(mips_locore_atomicvec)
	PTR_WORD 	ras_atomic_cas_uint
	PTR_WORD 	ras_atomic_cas_ulong
	PTR_WORD	ras_ucas_32
#ifdef _LP64
	PTR_WORD	ras_ucas_64
#else
	PTR_WORD	0
#endif /* _LP64 */
#ifdef LOCKDEBUG
	PTR_WORD	mutex_enter
	PTR_WORD	mutex_exit
	PTR_WORD	mutex_spin_enter
	PTR_WORD	mutex_spin_exit
#else
	PTR_WORD	ras_mutex_enter
	PTR_WORD	ras_mutex_exit
	PTR_WORD	ras_mutex_spin_enter
	PTR_WORD	ras_mutex_spin_exit
#endif	/* !LOCKDEBUG */