#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

#ifdef  __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

.align	64
! rem_4bit: 16-entry lookup table used by gcm_ghash_4bit and gcm_gmult_4bit.
! Each entry occupies 8 bytes and is written as two 32-bit words; the
! second word of every entry is zero (entry 1, for instance, is 0x1C200000
! followed by 0).  The code fetches an entry with a 64-bit ldx at offset
! (low 4 bits of the accumulator low half) * 8 and XORs it into the high
! half of the accumulator after each 4-bit right shift.
rem_4bit:
	.long	0,0,471859200,0,943718400,0,610271232,0
	.long	1887436800,0,1822425088,0,1220542464,0,1423966208,0
	.long	3774873600,0,4246732800,0,3644850176,0,3311403008,0
	.long	2441084928,0,2376073216,0,2847932416,0,3051356160,0
.type	rem_4bit,#object
.size	rem_4bit,(.-rem_4bit)

! gcm_ghash_4bit: fold a run of 16-byte input blocks into a 128-bit hash
! state, one nibble at a time, using table lookups.
!
! Incoming arguments as they are used below (the C prototype is not visible
! in this file; presumably (Xi, Htable, inp, len) -- confirm with the caller):
!   %i0  hash state: bytes +0..+15 are read, 64-bit halves are written back
!        to +0 (high) and +8 (low) after every block
!   %i1  lookup table addressed as nibble*16; the doubleword at +0 of an
!        entry feeds the high half, the one at +8 (via %l6) the low half
!   %i2  input pointer, advanced by 16 per block
!   %i3  byte count, converted on entry into an end pointer (%i2 + count)
!
! Working registers:
!   %o0/%o1  accumulator Z, high/low half
!   %o2/%o3  table entry T, high/low half (XORed into Z one step late)
!   %o4      pending rem_4bit entry, %o5 scratch for the cross-half shift
!   %l0/%l1  high/low nibble of (state byte XOR input byte), scaled by 16
!   %l2/%l3  state bytes 15 and 14 carried over from the previous block
!   %l4      address of rem_4bit, %l5 index into it, %l6 = %i1+8
!   %l7      byte index for the inner loop
!
! NOTE(review): the block loop exits only when the advanced input pointer
! is exactly equal to the end pointer, and the test happens after a block
! has been processed.  A count of zero or one that is not a multiple of 16
! therefore never hits the exit test -- verify that callers guarantee this.
.globl	gcm_ghash_4bit
.align	32
gcm_ghash_4bit:
	save	%sp,-STACK_FRAME,%sp
	ldub	[%i2+15],%l1
	ldub	[%i0+15],%l2
	ldub	[%i0+14],%l3
	! turn the byte count into an end-of-input pointer
	add	%i3,%i2,%i3
	add	%i1,8,%l6

	! call .+8 is used only to capture the current PC in %o7; execution
	! simply continues after the delay slot.  The delay-slot add converts
	! that PC into the run-time address of rem_4bit.
1:	call	.+8
	add	%o7,rem_4bit-1b,%l4

.Louter:
	! byte 15: split (state XOR input) into two table offsets and start
	! Z from the entry selected by the low nibble
	xor	%l2,%l1,%l1
	and	%l1,0xf0,%l0
	and	%l1,0x0f,%l1
	sll	%l1,4,%l1
	ldx	[%l6+%l1],%o1
	ldx	[%i1+%l1],%o0

	ldub	[%i2+14],%l1

	! high nibble of byte 15: fetch T and the rem_4bit entry for the
	! four bits about to be shifted out, then shift Z right by 4 across
	! both halves (%o5 carries the low 4 bits of the high half down)
	ldx	[%l6+%l0],%o3
	and	%o1,0xf,%l5
	ldx	[%i1+%l0],%o2
	sll	%l5,3,%l5
	ldx	[%l4+%l5],%o4
	srlx	%o1,4,%o1
	! 13 is the first byte index fetched inside the inner loop
	mov	13,%l7
	sllx	%o0,60,%o5
	xor	%o3,%o1,%o1
	srlx	%o0,4,%o0
	xor	%o1,%o5,%o1

	! prepare the two table offsets for byte 14
	xor	%l3,%l1,%l1
	and	%o1,0xf,%l5
	and	%l1,0xf0,%l0
	and	%l1,0x0f,%l1
	ba	.Lghash_inner
	! delay slot: executed before the branch target
	sll	%l1,4,%l1
.align	32
	! Inner loop.  Each pass consumes both nibbles of the byte prepared
	! by the previous pass (low nibble first, then high nibble) and
	! fetches input and state byte number %l7 for the next pass.  The XOR
	! of T-high and of the rem_4bit entry into the high half is deferred
	! by one step, so %o2 and %o4 are live across the loop edge.
.Lghash_inner:
	ldx	[%l6+%l1],%o3
	sll	%l5,3,%l5
	xor	%o2,%o0,%o0
	ldx	[%i1+%l1],%o2
	srlx	%o1,4,%o1
	xor	%o4,%o0,%o0
	ldx	[%l4+%l5],%o4
	sllx	%o0,60,%o5
	xor	%o3,%o1,%o1
	ldub	[%i2+%l7],%l1
	srlx	%o0,4,%o0
	xor	%o1,%o5,%o1
	ldub	[%i0+%l7],%l3
	xor	%o2,%o0,%o0
	and	%o1,0xf,%l5

	ldx	[%l6+%l0],%o3
	sll	%l5,3,%l5
	xor	%o4,%o0,%o0
	ldx	[%i1+%l0],%o2
	srlx	%o1,4,%o1
	ldx	[%l4+%l5],%o4
	sllx	%o0,60,%o5
	xor	%l3,%l1,%l1
	srlx	%o0,4,%o0
	and	%l1,0xf0,%l0
	! adding -1 produces a carry unless %l7 was already 0, so the blu
	! below loops for byte indexes 13 down to 0 (14 passes)
	addcc	%l7,-1,%l7
	xor	%o1,%o5,%o1
	and	%l1,0x0f,%l1
	xor	%o3,%o1,%o1
	sll	%l1,4,%l1
	blu	.Lghash_inner
	! delay slot: runs on both the taken and the fall-through path
	and	%o1,0xf,%l5

	! tail: low nibble of byte 0
	ldx	[%l6+%l1],%o3
	sll	%l5,3,%l5
	xor	%o2,%o0,%o0
	ldx	[%i1+%l1],%o2
	srlx	%o1,4,%o1
	xor	%o4,%o0,%o0
	ldx	[%l4+%l5],%o4
	sllx	%o0,60,%o5
	xor	%o3,%o1,%o1
	srlx	%o0,4,%o0
	xor	%o1,%o5,%o1
	xor	%o2,%o0,%o0

	! advance to the next block and leave through .Ldone if that was
	! the last one
	add	%i2,16,%i2
	cmp	%i2,%i3
	be,pn	SIZE_T_CC,.Ldone
	! delay slot: runs on both paths
	and	%o1,0xf,%l5

	! more input follows: finish the high nibble of byte 0, write the
	! state back, and prefetch byte 15 of the next block
	ldx	[%l6+%l0],%o3
	sll	%l5,3,%l5
	xor	%o4,%o0,%o0
	ldx	[%i1+%l0],%o2
	srlx	%o1,4,%o1
	ldx	[%l4+%l5],%o4
	sllx	%o0,60,%o5
	xor	%o3,%o1,%o1
	ldub	[%i2+15],%l1
	srlx	%o0,4,%o0
	xor	%o1,%o5,%o1
	xor	%o2,%o0,%o0
	stx	%o1,[%i0+8]
	xor	%o4,%o0,%o0
	stx	%o0,[%i0]
	! take state bytes 15 and 14 for the next block straight from the
	! low half just stored instead of reloading them
	srl	%o1,8,%l3
	and	%o1,0xff,%l2
	ba	.Louter
	! delay slot
	and	%l3,0xff,%l3
.align	32
.Ldone:
	! last block: finish the high nibble of byte 0 and store the state
	ldx	[%l6+%l0],%o3
	sll	%l5,3,%l5
	xor	%o4,%o0,%o0
	ldx	[%i1+%l0],%o2
	srlx	%o1,4,%o1
	ldx	[%l4+%l5],%o4
	sllx	%o0,60,%o5
	xor	%o3,%o1,%o1
	srlx	%o0,4,%o0
	xor	%o1,%o5,%o1
	xor	%o2,%o0,%o0
	stx	%o1,[%i0+8]
	xor	%o4,%o0,%o0
	stx	%o0,[%i0]

	ret
	restore
.type	gcm_ghash_4bit,#function
.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
! gcm_gmult_4bit: transform the 128-bit hash state in place, one nibble at
! a time, using table lookups.  Same pipeline as gcm_ghash_4bit but the
! nibbles come from the state alone; no input data is read.
!
! Incoming arguments as they are used below (the C prototype is not visible
! in this file; presumably (Xi, Htable) -- confirm with the caller):
!   %i0  hash state: bytes +15 down to +0 are read, 64-bit halves are
!        written back to +0 (high) and +8 (low)
!   %i1  lookup table addressed as nibble*16; the doubleword at +0 of an
!        entry feeds the high half, the one at +8 (via %l6) the low half
!
! Working registers:
!   %o0/%o1  accumulator Z, high/low half
!   %o2/%o3  table entry T, high/low half (XORed into Z one step late)
!   %o4      pending rem_4bit entry, %o5 scratch for the cross-half shift
!   %l0/%l1  high/low nibble of the current state byte, scaled by 16
!   %l4      address of rem_4bit, %l5 index into it, %l6 = %i1+8
!   %l7      byte index for the inner loop
.globl	gcm_gmult_4bit
.align	32
gcm_gmult_4bit:
	save	%sp,-STACK_FRAME,%sp
	ldub	[%i0+15],%l1
	add	%i1,8,%l6

	! call .+8 is used only to capture the current PC in %o7; execution
	! simply continues after the delay slot.  The delay-slot add converts
	! that PC into the run-time address of rem_4bit.
1:	call	.+8
	add	%o7,rem_4bit-1b,%l4

	! byte 15: split into two table offsets and start Z from the entry
	! selected by the low nibble
	and	%l1,0xf0,%l0
	and	%l1,0x0f,%l1
	sll	%l1,4,%l1
	ldx	[%l6+%l1],%o1
	ldx	[%i1+%l1],%o0

	ldub	[%i0+14],%l1

	! high nibble of byte 15: fetch T and the rem_4bit entry for the
	! four bits about to be shifted out, then shift Z right by 4 across
	! both halves (%o5 carries the low 4 bits of the high half down)
	ldx	[%l6+%l0],%o3
	and	%o1,0xf,%l5
	ldx	[%i1+%l0],%o2
	sll	%l5,3,%l5
	ldx	[%l4+%l5],%o4
	srlx	%o1,4,%o1
	! 13 is the first byte index fetched inside the inner loop
	mov	13,%l7
	sllx	%o0,60,%o5
	xor	%o3,%o1,%o1
	srlx	%o0,4,%o0
	xor	%o1,%o5,%o1

	! prepare the two table offsets for byte 14
	and	%o1,0xf,%l5
	and	%l1,0xf0,%l0
	and	%l1,0x0f,%l1
	ba	.Lgmult_inner
	! delay slot: executed before the branch target
	sll	%l1,4,%l1
.align	32
	! Inner loop.  Each pass consumes both nibbles of the byte prepared
	! by the previous pass (low nibble first, then high nibble) and
	! fetches state byte number %l7 for the next pass.  The XOR of T-high
	! and of the rem_4bit entry into the high half is deferred by one
	! step, so %o2 and %o4 are live across the loop edge.
.Lgmult_inner:
	ldx	[%l6+%l1],%o3
	sll	%l5,3,%l5
	xor	%o2,%o0,%o0
	ldx	[%i1+%l1],%o2
	srlx	%o1,4,%o1
	xor	%o4,%o0,%o0
	ldx	[%l4+%l5],%o4
	sllx	%o0,60,%o5
	xor	%o3,%o1,%o1
	ldub	[%i0+%l7],%l1
	srlx	%o0,4,%o0
	xor	%o1,%o5,%o1
	xor	%o2,%o0,%o0
	and	%o1,0xf,%l5

	ldx	[%l6+%l0],%o3
	sll	%l5,3,%l5
	xor	%o4,%o0,%o0
	ldx	[%i1+%l0],%o2
	srlx	%o1,4,%o1
	ldx	[%l4+%l5],%o4
	sllx	%o0,60,%o5
	srlx	%o0,4,%o0
	and	%l1,0xf0,%l0
	! adding -1 produces a carry unless %l7 was already 0, so the blu
	! below loops for byte indexes 13 down to 0 (14 passes)
	addcc	%l7,-1,%l7
	xor	%o1,%o5,%o1
	and	%l1,0x0f,%l1
	xor	%o3,%o1,%o1
	sll	%l1,4,%l1
	blu	.Lgmult_inner
	! delay slot: runs on both the taken and the fall-through path
	and	%o1,0xf,%l5

	! tail: low nibble of byte 0
	ldx	[%l6+%l1],%o3
	sll	%l5,3,%l5
	xor	%o2,%o0,%o0
	ldx	[%i1+%l1],%o2
	srlx	%o1,4,%o1
	xor	%o4,%o0,%o0
	ldx	[%l4+%l5],%o4
	sllx	%o0,60,%o5
	xor	%o3,%o1,%o1
	srlx	%o0,4,%o0
	xor	%o1,%o5,%o1
	xor	%o2,%o0,%o0
	and	%o1,0xf,%l5

	! tail: high nibble of byte 0, then write the state back
	ldx	[%l6+%l0],%o3
	sll	%l5,3,%l5
	xor	%o4,%o0,%o0
	ldx	[%i1+%l0],%o2
	srlx	%o1,4,%o1
	ldx	[%l4+%l5],%o4
	sllx	%o0,60,%o5
	xor	%o3,%o1,%o1
	srlx	%o0,4,%o0
	xor	%o1,%o5,%o1
	xor	%o2,%o0,%o0
	stx	%o1,[%i0+8]
	xor	%o4,%o0,%o0
	stx	%o0,[%i0]

	ret
	restore
.type	gcm_gmult_4bit,#function
.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)
! gcm_init_vis3: one-time setup of the 24-byte table consumed by the VIS3
! multiply routines in this file.
!
!   %i0  output: +0/+8 receive the twisted H (high/low half), +16 receives
!        the constant 0xA040608020C0E000
!   %i1  input: 128-bit H, high half at +0 and low half at +8
!
! Twisting: H is shifted left by one bit as a 128-bit quantity; when the
! bit shifted out was set, 1 is XORed into the low half and 0xE1<<57
! (taken modulo 2^64) into the high half.
.globl	gcm_init_vis3
.align	32
gcm_init_vis3:
	save	%sp,-STACK_FRAME,%sp

	! fetch H before anything is written
	ldx	[%i1+8],%o1		! low half
	ldx	[%i1+0],%o2		! high half

	! assemble (0xE0·i)&0xff=0xA040608020C0E000 from two 32-bit halves
	sethi	%hi(0x20C0E000),%l0
	sethi	%hi(0xA0406080),%g5
	or	%l0,%lo(0x20C0E000),%l0
	or	%g5,%lo(0xA0406080),%g5
	sllx	%g5,32,%g5
	or	%g5,%l0,%g5

	! correction terms and an all-ones/all-zeros mask from the top bit of H
	mov	1,%o3
	mov	0xE1,%o4
	srax	%o2,63,%g1		! broadcast carry
	sllx	%o4,57,%o4

	! H<<=1 over 128 bits; the addxc word must directly consume the carry
	! produced by addcc
	addcc	%o1,%o1,%o1
	.word	0x95b2822a !addxc	%o2,%o2,%o2

	! apply the correction only when the mask is all ones
	and	%g1,%o4,%o4
	and	%g1,%o3,%o3
	xor	%o4,%o2,%o2
	xor	%o3,%o1,%o1

	stx	%o2,[%i0+0]		! save twisted H
	stx	%o1,[%i0+8]
	stx	%g5,[%i0+16]

	ret
	restore
.type	gcm_init_vis3,#function
.size	gcm_init_vis3,.-gcm_init_vis3

! gcm_gmult_vis3: multiply the 128-bit hash state by the twisted H using
! the carry-less multiply instructions XMULX/XMULXHI, then reduce the
! product back to 128 bits; the state is updated in place.
!
!   %i0  hash state Xi: high half at +0, low half at +8 (read, then written)
!   %i1  table as laid out by gcm_init_vis3: twisted H at +0/+8 and the
!        constant 0xA040608020C0E000 at +16
!
! The multiply instructions are emitted as raw .word values with the
! intended mnemonic in the trailing comment (presumably so the file still
! assembles with tools that lack VIS3 mnemonics -- confirm).
.globl	gcm_gmult_vis3
.align	32
gcm_gmult_vis3:
	save	%sp,-STACK_FRAME,%sp

	ldx	[%i0+8],%o3		! load Xi
	ldx	[%i0+0],%o4
	ldx	[%i1+8],%o1	! load twisted H
	ldx	[%i1+0],%o2

	! %o5 = 0xE1<<57 truncated to 64 bits, the multiplier used by the
	! reduction multiplies further down
	mov	0xE1,%l7
	sllx	%l7,57,%o5		! 57 is not a typo
	ldx	[%i1+16],%g5		! (0xE0·i)&0xff=0xA040608020C0E000

	! Three 64x64 carry-less products, each as a low/high pair:
	!   Xlo*Hlo              -> %g1 (low), %o3 (high)
	!   (Xlo^Xhi)*(Hlo^Hhi)  -> %g2 (low), %g3 (high)
	!   Xhi*Hhi              -> %o4 (low), %g4 (high)
	xor	%o2,%o1,%o0		! Karatsuba pre-processing
	.word	0x83b2e2a9 !xmulx	%o3,%o1,%g1
	xor	%o3,%o4,%g3		! Karatsuba pre-processing
	.word	0x85b0e2a8 !xmulx	%g3,%o0,%g2
	.word	0x97b2e2c9 !xmulxhi	%o3,%o1,%o3
	.word	0x87b0e2c8 !xmulxhi	%g3,%o0,%g3
	.word	0x89b322ca !xmulxhi	%o4,%o2,%g4
	.word	0x99b322aa !xmulx	%o4,%o2,%o4

	! The low three bits of %g1 select one byte of the constant in %g5:
	! a register shift count uses only its low six bits, hence the
	! implicit masks mentioned in the comments below.
	sll	%g1,3,%o7
	srlx	%g5,%o7,%o7		! ·0xE0 [implicit &(7<<3)]
	xor	%g1,%o7,%o7
	sllx	%o7,57,%o7		! (%g1·0xE1)<<1<<56 [implicit &0x7f]

	xor	%g1,%g2,%g2		! Karatsuba post-processing
	xor	%o3,%g3,%g3
	 xor	%o7,%o3,%o3		! real destination is %g2
	xor	%g4,%g3,%g3
	xor	%o3,%g2,%g2
	xor	%o4,%g3,%g3
	xor	%o4,%g2,%g2

	! reduction: multiply the two low words by %o5 and fold the results
	! into the upper words; the indented XORs are independent work
	! interleaved between the multiplies
	.word	0x97b062cd !xmulxhi	%g1,%o5,%o3		! ·0xE1<<1<<56
	 xor	%g1,%g3,%g3
	.word	0x83b0a2ad !xmulx	%g2,%o5,%g1
	 xor	%g2,%g4,%g4
	.word	0x85b0a2cd !xmulxhi	%g2,%o5,%g2

	xor	%o3,%g3,%g3
	xor	%g1,%g3,%g3
	xor	%g2,%g4,%g4

	stx	%g3,[%i0+8]		! save Xi
	stx	%g4,[%i0+0]

	ret
	restore
.type	gcm_gmult_vis3,#function
.size	gcm_gmult_vis3,.-gcm_gmult_vis3

! gcm_ghash_vis3: fold a run of 16-byte input blocks into the 128-bit hash
! state.  Each block is XORed into the state, which is then multiplied by
! the twisted H with XMULX/XMULXHI and reduced, exactly as in
! gcm_gmult_vis3.
!
!   %i0  hash state Xi: high half at +0, low half at +8 (read once on
!        entry, written once on exit)
!   %i1  table as laid out by gcm_init_vis3: twisted H at +0/+8 and the
!        constant 0xA040608020C0E000 at +16
!   %i2  input pointer; any byte alignment is accepted
!   %i3  byte count
!
! The running state lives in %g3 (low) / %g4 (high) between iterations.
!
! NOTE(review): the loop subtracts 16 from %i3 and repeats until the result
! is exactly zero, and the first block is processed before any test.  A
! count of zero or one that is not a multiple of 16 therefore does not
! terminate normally -- verify that callers guarantee this.
.globl	gcm_ghash_vis3
.align	32
gcm_ghash_vis3:
	save	%sp,-STACK_FRAME,%sp
	nop
	srln	%i3,0,%i3		! needed on v8+, "nop" on v9

	ldx	[%i0+8],%g3		! load Xi
	ldx	[%i0+0],%g4
	ldx	[%i1+8],%o1	! load twisted H
	ldx	[%i1+0],%o2

	! %o5 = 0xE1<<57 truncated to 64 bits, the multiplier used by the
	! reduction multiplies further down
	mov	0xE1,%l7
	sllx	%l7,57,%o5		! 57 is not a typo
	ldx	[%i1+16],%g5		! (0xE0·i)&0xff=0xA040608020C0E000

	! Misalignment handling: %l0 = (inp & 7) * 8 is the left-shift count
	! in bits, %l1 = -%l0 serves as the matching right-shift count
	! (64 - %l0) because a register shift count uses only its low six
	! bits.  The pointer itself is rounded down to an 8-byte boundary.
	and	%i2,7,%l0
	andn	%i2,7,%i2
	sll	%l0,3,%l0
	prefetch [%i2+63], 20
	sub	%g0,%l0,%l1

	xor	%o2,%o1,%o0		! Karatsuba pre-processing
.Loop:
	ldx	[%i2+8],%o3
	brz,pt	%l0,1f
	! delay slot: runs on both the taken and the fall-through path
	ldx	[%i2+0],%o4

	! unaligned input: fetch a third doubleword and shift the 16 wanted
	! bytes into place across the three aligned doublewords
	ldx	[%i2+16],%g2		! align data
	srlx	%o3,%l1,%g1
	sllx	%o3,%l0,%o3
	sllx	%o4,%l0,%o4
	srlx	%g2,%l1,%g2
	or	%g1,%o4,%o4
	or	%g2,%o3,%o3
1:
	add	%i2,16,%i2
	sub	%i3,16,%i3
	! XOR the running state into the block just loaded
	xor	%g3,%o3,%o3
	xor	%g4,%o4,%o4
	prefetch [%i2+63], 20

	! Three 64x64 carry-less products, each as a low/high pair:
	!   Xlo*Hlo              -> %g1 (low), %o3 (high)
	!   (Xlo^Xhi)*(Hlo^Hhi)  -> %g2 (low), %g3 (high)
	!   Xhi*Hhi              -> %o4 (low), %g4 (high)
	! %g3 is free for reuse here because the state was already folded
	! into %o3/%o4 above.
	.word	0x83b2e2a9 !xmulx	%o3,%o1,%g1
	xor	%o3,%o4,%g3		! Karatsuba pre-processing
	.word	0x85b0e2a8 !xmulx	%g3,%o0,%g2
	.word	0x97b2e2c9 !xmulxhi	%o3,%o1,%o3
	.word	0x87b0e2c8 !xmulxhi	%g3,%o0,%g3
	.word	0x89b322ca !xmulxhi	%o4,%o2,%g4
	.word	0x99b322aa !xmulx	%o4,%o2,%o4

	! The low three bits of %g1 select one byte of the constant in %g5:
	! a register shift count uses only its low six bits, hence the
	! implicit masks mentioned in the comments below.
	sll	%g1,3,%o7
	srlx	%g5,%o7,%o7		! ·0xE0 [implicit &(7<<3)]
	xor	%g1,%o7,%o7
	sllx	%o7,57,%o7		! (%g1·0xE1)<<1<<56 [implicit &0x7f]

	xor	%g1,%g2,%g2		! Karatsuba post-processing
	xor	%o3,%g3,%g3
	 xor	%o7,%o3,%o3		! real destination is %g2
	xor	%g4,%g3,%g3
	xor	%o3,%g2,%g2
	xor	%o4,%g3,%g3
	xor	%o4,%g2,%g2

	! reduction: multiply the two low words by %o5 and fold the results
	! into the upper words; the indented XORs are independent work
	! interleaved between the multiplies
	.word	0x97b062cd !xmulxhi	%g1,%o5,%o3		! ·0xE1<<1<<56
	 xor	%g1,%g3,%g3
	.word	0x83b0a2ad !xmulx	%g2,%o5,%g1
	 xor	%g2,%g4,%g4
	.word	0x85b0a2cd !xmulxhi	%g2,%o5,%g2

	xor	%o3,%g3,%g3
	xor	%g1,%g3,%g3
	brnz,pt	%i3,.Loop
	! delay slot: completes the high half of the state on every pass
	xor	%g2,%g4,%g4

	stx	%g3,[%i0+8]		! save Xi
	stx	%g4,[%i0+0]

	ret
	restore
.type	gcm_ghash_vis3,#function
.size	gcm_ghash_vis3,.-gcm_ghash_vis3
.asciz	"GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro@openssl.org>"
.align	4