#include "arm_asm.h"
#ifndef	__KERNEL__
# include "arm_arch.h"

.hidden	OPENSSL_armcap_P
#endif

.text

.align	5
// Constant pool shared by the ChaCha20 routines in this file.  The three
// tables are laid out back to back, 16 bytes each, and the NEON code walks
// them with a post-incremented pointer, so their order must not change.

// ChaCha constant (state words 0..3): the ASCII text "expand 32-byte k".
.Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// Per-lane block-counter offsets.  ChaCha20_neon loads all four words into v8
// and adds them to the counter word of its four vector blocks;
// ChaCha20_512_neon loads only the first word (the value 1).
.Lone:
.long	1,2,3,4
// TBL byte-index vector: within every 32-bit lane, output byte 0 comes from
// input byte 3 and bytes 1..3 from input bytes 0..2, i.e. each lane is rotated
// left by 8 bits (the vector counterpart of the scalar "ror #24").
.Lrot24:
.long	0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
// NUL-terminated ASCII identification string:
// "ChaCha20 for ARMv8, CRYPTOGAMS by @dot-asm"
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0
.align	2

// ChaCha20_ctr32 -- XOR a buffer with the ChaCha20 keystream, one 64-byte
// block per outer iteration, using general-purpose registers only.
//
// Register roles on entry, as used by the code below:
//   x0 = output pointer		x1 = input pointer
//   x2 = length in bytes		x3 = pointer to the 32-byte key
//   x4 = pointer to the 16-byte counter block (state words 12..15)
// NOTE(review): presumably the C prototype is
//   void ChaCha20_ctr32(out, inp, len, key, counter) -- confirm in the header.
//
// Dispatch:
//   len == 0	returns immediately.
//   len >= 192	(non-kernel builds only) branches to .LChaCha20_neon when
//		the ARMV7_NEON bit is set in OPENSSL_armcap_P.
//   otherwise	falls into the scalar code at .Lshort.
//
// Register allocation in the scalar code:
//   x22,x23	sigma		(two 32-bit state words per register)
//   x24..x27	key
//   x28,x30	counter block; x30 (LR) is free for data because it was
//		saved in the frame and is reloaded before "ret"
//   w5..w17,w19..w21	working copy of the 16 state words, in order
//   x4		round-loop counter (the counter pointer is dead by then)
//
// Stack: a 96-byte frame holding x29/x30 and x19..x28, plus 64 bytes of
// scratch below it.  The scratch area is written only on the partial-block
// path, where it holds keystream and is zeroed again before returning.
//
// The label .Less_than_64 is also a branch target from ChaCha20_neon, which
// arrives with the same frame layout and the same register roles.
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
	cbz	x2,.Labort			// zero length: nothing to do
	cmp	x2,#192
	b.lo	.Lshort				// below 192 bytes: always scalar

#ifndef	__KERNEL__
	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
	tst	w17,#ARMV7_NEON
	b.ne	.LChaCha20_neon			// capability bit set: use NEON code
#endif

.Lshort:
.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	x5,.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64			// scratch for a partial last block

	ldp	x22,x23,[x5]		// load sigma
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ldp	x28,x30,[x4]		// load counter
#ifdef	__AARCH64EB__
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif

.Loop_outer:
	// Split each 64-bit register into two 32-bit working words: the low
	// half via a w-register move, the high half via a 32-bit right shift.
	mov	w5,w22			// unpack key block
	lsr	x6,x22,#32
	mov	w7,w23
	lsr	x8,x23,#32
	mov	w9,w24
	lsr	x10,x24,#32
	mov	w11,w25
	lsr	x12,x25,#32
	mov	w13,w26
	lsr	x14,x26,#32
	mov	w15,w27
	lsr	x16,x27,#32
	mov	w17,w28
	lsr	x19,x28,#32
	mov	w20,w30
	lsr	x21,x30,#32

	mov	x4,#10				// 10 iterations of two rounds each
	// The flags set here are consumed by "b.lo .Ltail" and "b.hi .Loop_outer"
	// further down; no instruction in between modifies them.
	subs	x2,x2,#64
.Loop:
	sub	x4,x4,#1
	add	w5,w5,w9			// column round: a += b
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5			// d ^= a
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#16			// d = rotl(d,16)
	ror	w19,w19,#16
	ror	w20,w20,#16
	ror	w21,w21,#16
	add	w13,w13,w17			// c += d
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13			// b ^= c
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#20			// b = rotl(b,12)
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	add	w5,w5,w9			// a += b
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5			// d ^= a
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#24			// d = rotl(d,8)
	ror	w19,w19,#24
	ror	w20,w20,#24
	ror	w21,w21,#24
	add	w13,w13,w17			// c += d
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13			// b ^= c
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#25			// b = rotl(b,7)
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	add	w5,w5,w10			// diagonal round: same steps, with the
	add	w6,w6,w11			// b, c and d operands taken one, two
	add	w7,w7,w12			// and three positions further along
	add	w8,w8,w9			// their rows
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#16
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	ror	w9,w9,#20
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#24
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	ror	w9,w9,#25
	cbnz	x4,.Loop

	// Add the original input state back in.  The odd-numbered words use a
	// 64-bit add; any carry above bit 31 is discarded later by the
	// "lsl #32" in the pack step.
	add	w5,w5,w22		// accumulate key block
	add	x6,x6,x22,lsr#32
	add	w7,w7,w23
	add	x8,x8,x23,lsr#32
	add	w9,w9,w24
	add	x10,x10,x24,lsr#32
	add	w11,w11,w25
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	x21,x21,x30,lsr#32

	b.lo	.Ltail				// fewer than 64 bytes were left

	// Full block: pack word pairs into 64-bit registers, XOR with 64 bytes
	// of input and store 64 bytes of output.
	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	// 64-bit add on the register holding state words 12 and 13: a wrap of
	// the low word carries into the high word.
	add	x28,x28,#1			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64

	b.hi	.Loop_outer			// more input remains

	// Exactly consumed: restore callee-saved registers and return.
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.inst	0xd50323bf			// autiasp
.Labort:
	ret

.align	4
.Ltail:
	add	x2,x2,#64			// undo the subs: x2 = bytes left
.Less_than_64:
	// Bias the pointers so one negative index walks all three buffers:
	// x1 = in + len, x4 = keystream + len, x0 = out + len - 1, x2 = -len.
	// x0 carries the extra -1 because x2 is incremented before the store.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
#ifdef	__AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	stp	x5,x7,[sp,#0]			// spill the keystream block to scratch
	stp	x9,x11,[sp,#16]
	stp	x13,x15,[sp,#32]
	stp	x17,x20,[sp,#48]

.Loop_tail:
	ldrb	w10,[x1,x2]			// input byte
	ldrb	w11,[x4,x2]			// keystream byte
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail			// until the index reaches zero

	stp	xzr,xzr,[sp,#0]			// wipe the keystream from the stack
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.inst	0xd50323bf			// autiasp
	ret
.size	ChaCha20_ctr32,.-ChaCha20_ctr32

// ChaCha20_neon -- XOR a buffer with the ChaCha20 keystream, producing five
// 64-byte blocks (320 bytes) per outer iteration: four blocks in NEON
// registers and one in general-purpose registers, with the two instruction
// streams interleaved.
//
// The symbol is exported only when __KERNEL__ is defined; otherwise it is
// reached from ChaCha20_ctr32 through the local label .LChaCha20_neon.
//
// Register roles on entry (the same as in ChaCha20_ctr32):
//   x0 = output pointer		x1 = input pointer
//   x2 = length in bytes		x3 = pointer to the 32-byte key
//   x4 = pointer to the 16-byte counter block
//
// A length of 512 bytes or more is handed to .L512_or_more_neon, a label
// inside ChaCha20_512_neon placed after an identical register-save prologue.
//
// Register allocation:
//   v0		sigma		v1,v2	key		v3	counter block
//   v8		per-lane counter offsets from .Lone, advanced by 5 per iteration
//   v9		.Lrot24 byte-index vector for TBL
//   v16..v31	vector state: during the rounds each register holds one state
//		word for four blocks (one block per lane); after the transpose,
//		v16-v19, v20-v23, v24-v27 and v28-v31 each hold one whole block
//   v4..v7	temporaries
//   x22..x28,x30 and w5..w17,w19..w21: scalar block, as in ChaCha20_ctr32
//
// Output order within each 320-byte group: the scalar block first (lane
// offset 0), then the four vector blocks (offsets 1..4 from .Lone).
//
// Stack: the same 96-byte frame as ChaCha20_ctr32 plus 64 bytes of scratch.
// The scratch area holds the saved d8/d9 while the main loop runs; on the
// tail path d8/d9 are reloaded first, so the area can then be reused for the
// keystream of a partial last block.
#ifdef	__KERNEL__
.globl	ChaCha20_neon
#endif
.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
.LChaCha20_neon:
.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	x5,.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	x2,#512
	b.hs	.L512_or_more_neon		// long input: use the 512-byte code

	sub	sp,sp,#64

	ldp	x22,x23,[x5]		// load sigma
	ld1	{v0.4s},[x5],#16		// x5 now points at .Lone
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ld1	{v1.4s,v2.4s},[x3]
	ldp	x28,x30,[x4]		// load counter
	ld1	{v3.4s},[x4]
	stp	d8,d9,[sp]			// meet ABI requirements
	ld1	{v8.4s,v9.4s},[x5]		// v8 = .Lone, v9 = .Lrot24
#ifdef	__AARCH64EB__
	rev64	v0.4s,v0.4s
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif

.Loop_outer_neon:
	// Broadcast every state word across the four lanes of its own vector
	// register, and unpack the scalar block as in ChaCha20_ctr32.
	dup	v16.4s,v0.s[0]			// unpack key block
	mov	w5,w22
	dup	v20.4s,v0.s[1]
	lsr	x6,x22,#32
	dup	v24.4s,v0.s[2]
	mov	w7,w23
	dup	v28.4s,v0.s[3]
	lsr	x8,x23,#32
	dup	v17.4s,v1.s[0]
	mov	w9,w24
	dup	v21.4s,v1.s[1]
	lsr	x10,x24,#32
	dup	v25.4s,v1.s[2]
	mov	w11,w25
	dup	v29.4s,v1.s[3]
	lsr	x12,x25,#32
	dup	v19.4s,v3.s[0]
	mov	w13,w26
	dup	v23.4s,v3.s[1]
	lsr	x14,x26,#32
	dup	v27.4s,v3.s[2]
	mov	w15,w27
	dup	v31.4s,v3.s[3]
	lsr	x16,x27,#32
	add	v19.4s,v19.4s,v8.4s		// give each lane its own block counter
	mov	w17,w28
	dup	v18.4s,v2.s[0]
	lsr	x19,x28,#32
	dup	v22.4s,v2.s[1]
	mov	w20,w30
	dup	v26.4s,v2.s[2]
	lsr	x21,x30,#32
	dup	v30.4s,v2.s[3]

	mov	x4,#10				// 10 iterations of two rounds each
	// The flags set here are consumed by "b.lo .Ltail_neon" and
	// "b.hi .Loop_outer_neon"; no instruction in between modifies them.
	subs	x2,x2,#320
.Loop_neon:
	sub	x4,x4,#1
	// Column round, vector and scalar streams interleaved.
	add	v16.4s,v16.4s,v17.4s
	add	w5,w5,w9
	add	v20.4s,v20.4s,v21.4s
	add	w6,w6,w10
	add	v24.4s,v24.4s,v25.4s
	add	w7,w7,w11
	add	v28.4s,v28.4s,v29.4s
	add	w8,w8,w12
	eor	v19.16b,v19.16b,v16.16b
	eor	w17,w17,w5
	eor	v23.16b,v23.16b,v20.16b
	eor	w19,w19,w6
	eor	v27.16b,v27.16b,v24.16b
	eor	w20,w20,w7
	eor	v31.16b,v31.16b,v28.16b
	eor	w21,w21,w8
	rev32	v19.8h,v19.8h			// rotate by 16: swap halfwords
	ror	w17,w17,#16
	rev32	v23.8h,v23.8h
	ror	w19,w19,#16
	rev32	v27.8h,v27.8h
	ror	w20,w20,#16
	rev32	v31.8h,v31.8h
	ror	w21,w21,#16
	add	v18.4s,v18.4s,v19.4s
	add	w13,w13,w17
	add	v22.4s,v22.4s,v23.4s
	add	w14,w14,w19
	add	v26.4s,v26.4s,v27.4s
	add	w15,w15,w20
	add	v30.4s,v30.4s,v31.4s
	add	w16,w16,w21
	eor	v4.16b,v17.16b,v18.16b
	eor	w9,w9,w13
	eor	v5.16b,v21.16b,v22.16b
	eor	w10,w10,w14
	eor	v6.16b,v25.16b,v26.16b
	eor	w11,w11,w15
	eor	v7.16b,v29.16b,v30.16b
	eor	w12,w12,w16
	ushr	v17.4s,v4.4s,#20		// rotate left by 12: shift right 20 ...
	ror	w9,w9,#20
	ushr	v21.4s,v5.4s,#20
	ror	w10,w10,#20
	ushr	v25.4s,v6.4s,#20
	ror	w11,w11,#20
	ushr	v29.4s,v7.4s,#20
	ror	w12,w12,#20
	sli	v17.4s,v4.4s,#12		// ... then insert the left-shifted bits
	add	w5,w5,w9
	sli	v21.4s,v5.4s,#12
	add	w6,w6,w10
	sli	v25.4s,v6.4s,#12
	add	w7,w7,w11
	sli	v29.4s,v7.4s,#12
	add	w8,w8,w12
	add	v16.4s,v16.4s,v17.4s
	eor	w17,w17,w5
	add	v20.4s,v20.4s,v21.4s
	eor	w19,w19,w6
	add	v24.4s,v24.4s,v25.4s
	eor	w20,w20,w7
	add	v28.4s,v28.4s,v29.4s
	eor	w21,w21,w8
	eor	v4.16b,v19.16b,v16.16b
	ror	w17,w17,#24
	eor	v5.16b,v23.16b,v20.16b
	ror	w19,w19,#24
	eor	v6.16b,v27.16b,v24.16b
	ror	w20,w20,#24
	eor	v7.16b,v31.16b,v28.16b
	ror	w21,w21,#24
	tbl	v19.16b,{v4.16b},v9.16b		// rotate left by 8 via byte shuffle
	add	w13,w13,w17
	tbl	v23.16b,{v5.16b},v9.16b
	add	w14,w14,w19
	tbl	v27.16b,{v6.16b},v9.16b
	add	w15,w15,w20
	tbl	v31.16b,{v7.16b},v9.16b
	add	w16,w16,w21
	add	v18.4s,v18.4s,v19.4s
	eor	w9,w9,w13
	add	v22.4s,v22.4s,v23.4s
	eor	w10,w10,w14
	add	v26.4s,v26.4s,v27.4s
	eor	w11,w11,w15
	add	v30.4s,v30.4s,v31.4s
	eor	w12,w12,w16
	eor	v4.16b,v17.16b,v18.16b
	ror	w9,w9,#25
	eor	v5.16b,v21.16b,v22.16b
	ror	w10,w10,#25
	eor	v6.16b,v25.16b,v26.16b
	ror	w11,w11,#25
	eor	v7.16b,v29.16b,v30.16b
	ror	w12,w12,#25
	ushr	v17.4s,v4.4s,#25		// rotate left by 7
	ushr	v21.4s,v5.4s,#25
	ushr	v25.4s,v6.4s,#25
	ushr	v29.4s,v7.4s,#25
	sli	v17.4s,v4.4s,#7
	sli	v21.4s,v5.4s,#7
	sli	v25.4s,v6.4s,#7
	sli	v29.4s,v7.4s,#7
	// Diagonal round: the same steps with the b, c and d operands taken
	// from the next, second-next and third-next register of each row.
	add	v16.4s,v16.4s,v21.4s
	add	w5,w5,w10
	add	v20.4s,v20.4s,v25.4s
	add	w6,w6,w11
	add	v24.4s,v24.4s,v29.4s
	add	w7,w7,w12
	add	v28.4s,v28.4s,v17.4s
	add	w8,w8,w9
	eor	v31.16b,v31.16b,v16.16b
	eor	w21,w21,w5
	eor	v19.16b,v19.16b,v20.16b
	eor	w17,w17,w6
	eor	v23.16b,v23.16b,v24.16b
	eor	w19,w19,w7
	eor	v27.16b,v27.16b,v28.16b
	eor	w20,w20,w8
	rev32	v31.8h,v31.8h
	ror	w21,w21,#16
	rev32	v19.8h,v19.8h
	ror	w17,w17,#16
	rev32	v23.8h,v23.8h
	ror	w19,w19,#16
	rev32	v27.8h,v27.8h
	ror	w20,w20,#16
	add	v26.4s,v26.4s,v31.4s
	add	w15,w15,w21
	add	v30.4s,v30.4s,v19.4s
	add	w16,w16,w17
	add	v18.4s,v18.4s,v23.4s
	add	w13,w13,w19
	add	v22.4s,v22.4s,v27.4s
	add	w14,w14,w20
	eor	v4.16b,v21.16b,v26.16b
	eor	w10,w10,w15
	eor	v5.16b,v25.16b,v30.16b
	eor	w11,w11,w16
	eor	v6.16b,v29.16b,v18.16b
	eor	w12,w12,w13
	eor	v7.16b,v17.16b,v22.16b
	eor	w9,w9,w14
	ushr	v21.4s,v4.4s,#20
	ror	w10,w10,#20
	ushr	v25.4s,v5.4s,#20
	ror	w11,w11,#20
	ushr	v29.4s,v6.4s,#20
	ror	w12,w12,#20
	ushr	v17.4s,v7.4s,#20
	ror	w9,w9,#20
	sli	v21.4s,v4.4s,#12
	add	w5,w5,w10
	sli	v25.4s,v5.4s,#12
	add	w6,w6,w11
	sli	v29.4s,v6.4s,#12
	add	w7,w7,w12
	sli	v17.4s,v7.4s,#12
	add	w8,w8,w9
	add	v16.4s,v16.4s,v21.4s
	eor	w21,w21,w5
	add	v20.4s,v20.4s,v25.4s
	eor	w17,w17,w6
	add	v24.4s,v24.4s,v29.4s
	eor	w19,w19,w7
	add	v28.4s,v28.4s,v17.4s
	eor	w20,w20,w8
	eor	v4.16b,v31.16b,v16.16b
	ror	w21,w21,#24
	eor	v5.16b,v19.16b,v20.16b
	ror	w17,w17,#24
	eor	v6.16b,v23.16b,v24.16b
	ror	w19,w19,#24
	eor	v7.16b,v27.16b,v28.16b
	ror	w20,w20,#24
	tbl	v31.16b,{v4.16b},v9.16b
	add	w15,w15,w21
	tbl	v19.16b,{v5.16b},v9.16b
	add	w16,w16,w17
	tbl	v23.16b,{v6.16b},v9.16b
	add	w13,w13,w19
	tbl	v27.16b,{v7.16b},v9.16b
	add	w14,w14,w20
	add	v26.4s,v26.4s,v31.4s
	eor	w10,w10,w15
	add	v30.4s,v30.4s,v19.4s
	eor	w11,w11,w16
	add	v18.4s,v18.4s,v23.4s
	eor	w12,w12,w13
	add	v22.4s,v22.4s,v27.4s
	eor	w9,w9,w14
	eor	v4.16b,v21.16b,v26.16b
	ror	w10,w10,#25
	eor	v5.16b,v25.16b,v30.16b
	ror	w11,w11,#25
	eor	v6.16b,v29.16b,v18.16b
	ror	w12,w12,#25
	eor	v7.16b,v17.16b,v22.16b
	ror	w9,w9,#25
	ushr	v21.4s,v4.4s,#25
	ushr	v25.4s,v5.4s,#25
	ushr	v29.4s,v6.4s,#25
	ushr	v17.4s,v7.4s,#25
	sli	v21.4s,v4.4s,#7
	sli	v25.4s,v5.4s,#7
	sli	v29.4s,v6.4s,#7
	sli	v17.4s,v7.4s,#7
	cbnz	x4,.Loop_neon

	// Fold the per-lane counter offsets into the counter word now, while the
	// lanes still correspond to blocks; the shared part of the counter is
	// added together with the rest of the key block after the transpose.
	add	v19.4s,v19.4s,v8.4s

	// 4x4 transposes: turn "one state word for four blocks" registers into
	// "four consecutive words of one block" registers.
	zip1	v4.4s,v16.4s,v20.4s			// transpose data
	zip1	v5.4s,v24.4s,v28.4s
	zip2	v6.4s,v16.4s,v20.4s
	zip2	v7.4s,v24.4s,v28.4s
	zip1	v16.2d,v4.2d,v5.2d
	zip2	v20.2d,v4.2d,v5.2d
	zip1	v24.2d,v6.2d,v7.2d
	zip2	v28.2d,v6.2d,v7.2d

	zip1	v4.4s,v17.4s,v21.4s
	zip1	v5.4s,v25.4s,v29.4s
	zip2	v6.4s,v17.4s,v21.4s
	zip2	v7.4s,v25.4s,v29.4s
	zip1	v17.2d,v4.2d,v5.2d
	zip2	v21.2d,v4.2d,v5.2d
	zip1	v25.2d,v6.2d,v7.2d
	zip2	v29.2d,v6.2d,v7.2d

	zip1	v4.4s,v18.4s,v22.4s
	add	w5,w5,w22		// accumulate key block
	zip1	v5.4s,v26.4s,v30.4s
	add	x6,x6,x22,lsr#32
	zip2	v6.4s,v18.4s,v22.4s
	add	w7,w7,w23
	zip2	v7.4s,v26.4s,v30.4s
	add	x8,x8,x23,lsr#32
	zip1	v18.2d,v4.2d,v5.2d
	add	w9,w9,w24
	zip2	v22.2d,v4.2d,v5.2d
	add	x10,x10,x24,lsr#32
	zip1	v26.2d,v6.2d,v7.2d
	add	w11,w11,w25
	zip2	v30.2d,v6.2d,v7.2d
	add	x12,x12,x25,lsr#32

	zip1	v4.4s,v19.4s,v23.4s
	add	w13,w13,w26
	zip1	v5.4s,v27.4s,v31.4s
	add	x14,x14,x26,lsr#32
	zip2	v6.4s,v19.4s,v23.4s
	add	w15,w15,w27
	zip2	v7.4s,v27.4s,v31.4s
	add	x16,x16,x27,lsr#32
	zip1	v19.2d,v4.2d,v5.2d
	add	w17,w17,w28
	zip2	v23.2d,v4.2d,v5.2d
	add	x19,x19,x28,lsr#32
	zip1	v27.2d,v6.2d,v7.2d
	add	w20,w20,w30
	zip2	v31.2d,v6.2d,v7.2d
	add	x21,x21,x30,lsr#32

	b.lo	.Ltail_neon			// fewer than 320 bytes were left

	// Full 320-byte group: scalar block first, then the four vector blocks,
	// with loads, key-block adds, XORs and stores software-pipelined.
	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	v16.4s,v16.4s,v0.4s			// accumulate key block
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	v17.4s,v17.4s,v1.4s
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	v18.4s,v18.4s,v2.4s
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	v19.4s,v19.4s,v3.4s
	add	x1,x1,#64
#ifdef	__AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
	eor	x5,x5,x6
	add	v20.4s,v20.4s,v0.4s
	eor	x7,x7,x8
	add	v21.4s,v21.4s,v1.4s
	eor	x9,x9,x10
	add	v22.4s,v22.4s,v2.4s
	eor	x11,x11,x12
	add	v23.4s,v23.4s,v3.4s
	eor	x13,x13,x14
	eor	v16.16b,v16.16b,v4.16b
	movi	v4.4s,#5
	eor	x15,x15,x16
	eor	v17.16b,v17.16b,v5.16b
	eor	x17,x17,x19
	eor	v18.16b,v18.16b,v6.16b
	eor	x20,x20,x21
	eor	v19.16b,v19.16b,v7.16b
	add	v8.4s,v8.4s,v4.4s			// += 5
	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#5			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64

	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
	add	v24.4s,v24.4s,v0.4s
	add	v25.4s,v25.4s,v1.4s
	add	v26.4s,v26.4s,v2.4s
	add	v27.4s,v27.4s,v3.4s
	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64

	eor	v20.16b,v20.16b,v4.16b
	eor	v21.16b,v21.16b,v5.16b
	eor	v22.16b,v22.16b,v6.16b
	eor	v23.16b,v23.16b,v7.16b
	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
	add	v28.4s,v28.4s,v0.4s
	add	v29.4s,v29.4s,v1.4s
	add	v30.4s,v30.4s,v2.4s
	add	v31.4s,v31.4s,v3.4s
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

	eor	v24.16b,v24.16b,v16.16b
	eor	v25.16b,v25.16b,v17.16b
	eor	v26.16b,v26.16b,v18.16b
	eor	v27.16b,v27.16b,v19.16b
	st1	{v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64

	eor	v28.16b,v28.16b,v20.16b
	eor	v29.16b,v29.16b,v21.16b
	eor	v30.16b,v30.16b,v22.16b
	eor	v31.16b,v31.16b,v23.16b
	st1	{v28.16b,v29.16b,v30.16b,v31.16b},[x0],#64

	b.hi	.Loop_outer_neon		// more input remains

	ldp	d8,d9,[sp]			// meet ABI requirements

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.inst	0xd50323bf			// autiasp
	ret

.align	4
.Ltail_neon:
	// Fewer than 320 bytes remain.  Keystream for all five blocks is already
	// computed; consume it one 64-byte block at a time.
	add	x2,x2,#320			// undo the subs: x2 = bytes left
	ldp	d8,d9,[sp]			// meet ABI requirements
	cmp	x2,#64
	b.lo	.Less_than_64			// partial scalar block: finish in ChaCha20_ctr32

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	add	v16.4s,v16.4s,v0.4s			// accumulate key block
	stp	x9,x11,[x0,#16]
	add	v17.4s,v17.4s,v1.4s
	stp	x13,x15,[x0,#32]
	add	v18.4s,v18.4s,v2.4s
	stp	x17,x20,[x0,#48]
	add	v19.4s,v19.4s,v3.4s
	add	x0,x0,#64
	b.eq	.Ldone_neon			// flags from "cmp x2,#64": exactly 64 left
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	.Last_neon

	// From here on, the finished keystream of the block to be consumed next
	// is always placed in v16-v19, which is what .Last_neon spills.
	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
	eor	v16.16b,v16.16b,v4.16b
	eor	v17.16b,v17.16b,v5.16b
	eor	v18.16b,v18.16b,v6.16b
	eor	v19.16b,v19.16b,v7.16b
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
	b.eq	.Ldone_neon

	add	v16.4s,v20.4s,v0.4s
	add	v17.4s,v21.4s,v1.4s
	sub	x2,x2,#64
	add	v18.4s,v22.4s,v2.4s
	cmp	x2,#64
	add	v19.4s,v23.4s,v3.4s
	b.lo	.Last_neon

	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
	eor	v20.16b,v16.16b,v4.16b
	eor	v21.16b,v17.16b,v5.16b
	eor	v22.16b,v18.16b,v6.16b
	eor	v23.16b,v19.16b,v7.16b
	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
	b.eq	.Ldone_neon

	add	v16.4s,v24.4s,v0.4s
	add	v17.4s,v25.4s,v1.4s
	sub	x2,x2,#64
	add	v18.4s,v26.4s,v2.4s
	cmp	x2,#64
	add	v19.4s,v27.4s,v3.4s
	b.lo	.Last_neon

	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
	eor	v24.16b,v16.16b,v4.16b
	eor	v25.16b,v17.16b,v5.16b
	eor	v26.16b,v18.16b,v6.16b
	eor	v27.16b,v19.16b,v7.16b
	st1	{v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64
	b.eq	.Ldone_neon

	add	v16.4s,v28.4s,v0.4s
	add	v17.4s,v29.4s,v1.4s
	add	v18.4s,v30.4s,v2.4s
	add	v19.4s,v31.4s,v3.4s
	sub	x2,x2,#64

.Last_neon:
	// Partial last block: spill its keystream to the scratch area and XOR
	// byte by byte, using the same negative-index scheme as .Loop_tail in
	// ChaCha20_ctr32.
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]

	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

.Loop_tail_neon:
	ldrb	w10,[x1,x2]			// input byte
	ldrb	w11,[x4,x2]			// keystream byte
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail_neon

	stp	xzr,xzr,[sp,#0]			// wipe the keystream from the stack
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.inst	0xd50323bf			// autiasp
	ret
.size	ChaCha20_neon,.-ChaCha20_neon
.type	ChaCha20_512_neon,%function
.align	5
ChaCha20_512_neon:
.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	x5,.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

.L512_or_more_neon:
	sub	sp,sp,#128+64

	eor	v7.16b,v7.16b,v7.16b
	ldp	x22,x23,[x5]		// load sigma
	ld1	{v0.4s},[x5],#16
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ld1	{v1.4s,v2.4s},[x3]
	ldp	x28,x30,[x4]		// load counter
	ld1	{v3.4s},[x4]
	ld1	{v7.s}[0],[x5]
	add	x3,x5,#16			// .Lrot24
#ifdef	__AARCH64EB__
	rev64	v0.4s,v0.4s
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif
	add	v3.4s,v3.4s,v7.4s		// += 1
	stp	q0,q1,[sp,#0]		// off-load key block, invariant part
	add	v3.4s,v3.4s,v7.4s		// not typo
	str	q2,[sp,#32]
	add	v4.4s,v3.4s,v7.4s
	add	v5.4s,v4.4s,v7.4s
	add	v6.4s,v5.4s,v7.4s
	shl	v7.4s,v7.4s,#2			// 1 -> 4

	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
	stp	d10,d11,[sp,#128+16]
	stp	d12,d13,[sp,#128+32]
	stp	d14,d15,[sp,#128+48]

	sub	x2,x2,#512			// not typo

.Loop_outer_512_neon:
	mov	v8.16b,v0.16b
	mov	v12.16b,v0.16b
	mov	v16.16b,v0.16b
	mov	v20.16b,v0.16b
	mov	v24.16b,v0.16b
	mov	v28.16b,v0.16b
	mov	v9.16b,v1.16b
	mov	w5,w22			// unpack key block
	mov	v13.16b,v1.16b
	lsr	x6,x22,#32
	mov	v17.16b,v1.16b
	mov	w7,w23
	mov	v21.16b,v1.16b
	lsr	x8,x23,#32
	mov	v25.16b,v1.16b
	mov	w9,w24
	mov	v29.16b,v1.16b
	lsr	x10,x24,#32
	mov	v11.16b,v3.16b
	mov	w11,w25
	mov	v15.16b,v4.16b
	lsr	x12,x25,#32
	mov	v19.16b,v5.16b
	mov	w13,w26
	mov	v23.16b,v6.16b
	lsr	x14,x26,#32
	mov	v10.16b,v2.16b
	mov	w15,w27
	mov	v14.16b,v2.16b
	lsr	x16,x27,#32
	add	v27.4s,v11.4s,v7.4s			// +4
	mov	w17,w28
	add	v31.4s,v15.4s,v7.4s			// +4
	lsr	x19,x28,#32
	mov	v18.16b,v2.16b
	mov	w20,w30
	mov	v22.16b,v2.16b
	lsr	x21,x30,#32
	mov	v26.16b,v2.16b
	stp	q3,q4,[sp,#48]		// off-load key block, variable part
	mov	v30.16b,v2.16b
	stp	q5,q6,[sp,#80]

	mov	x4,#5
	ld1	{v6.4s},[x3]
	subs	x2,x2,#512
.Loop_upper_neon:
	sub	x4,x4,#1
	add	v8.4s,v8.4s,v9.4s
	add	w5,w5,w9
	add	v12.4s,v12.4s,v13.4s
	add	w6,w6,w10
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w11
	add	v20.4s,v20.4s,v21.4s
	add	w8,w8,w12
	add	v24.4s,v24.4s,v25.4s
	eor	w17,w17,w5
	add	v28.4s,v28.4s,v29.4s
	eor	w19,w19,w6
	eor	v11.16b,v11.16b,v8.16b
	eor	w20,w20,w7
	eor	v15.16b,v15.16b,v12.16b
	eor	w21,w21,w8
	eor	v19.16b,v19.16b,v16.16b
	ror	w17,w17,#16
	eor	v23.16b,v23.16b,v20.16b
	ror	w19,w19,#16
	eor	v27.16b,v27.16b,v24.16b
	ror	w20,w20,#16
	eor	v31.16b,v31.16b,v28.16b
	ror	w21,w21,#16
	rev32	v11.8h,v11.8h
	add	w13,w13,w17
	rev32	v15.8h,v15.8h
	add	w14,w14,w19
	rev32	v19.8h,v19.8h
	add	w15,w15,w20
	rev32	v23.8h,v23.8h
	add	w16,w16,w21
	rev32	v27.8h,v27.8h
	eor	w9,w9,w13
	rev32	v31.8h,v31.8h
	eor	w10,w10,w14
	add	v10.4s,v10.4s,v11.4s
	eor	w11,w11,w15
	add	v14.4s,v14.4s,v15.4s
	eor	w12,w12,w16
	add	v18.4s,v18.4s,v19.4s
	ror	w9,w9,#20
	add	v22.4s,v22.4s,v23.4s
	ror	w10,w10,#20
	add	v26.4s,v26.4s,v27.4s
	ror	w11,w11,#20
	add	v30.4s,v30.4s,v31.4s
	ror	w12,w12,#20
	eor	v0.16b,v9.16b,v10.16b
	add	w5,w5,w9
	eor	v1.16b,v13.16b,v14.16b
	add	w6,w6,w10
	eor	v2.16b,v17.16b,v18.16b
	add	w7,w7,w11
	eor	v3.16b,v21.16b,v22.16b
	add	w8,w8,w12
	eor	v4.16b,v25.16b,v26.16b
	eor	w17,w17,w5
	eor	v5.16b,v29.16b,v30.16b
	eor	w19,w19,w6
	ushr	v9.4s,v0.4s,#20
	eor	w20,w20,w7
	ushr	v13.4s,v1.4s,#20
	eor	w21,w21,w8
	ushr	v17.4s,v2.4s,#20
	ror	w17,w17,#24
	ushr	v21.4s,v3.4s,#20
	ror	w19,w19,#24
	ushr	v25.4s,v4.4s,#20
	ror	w20,w20,#24
	ushr	v29.4s,v5.4s,#20
	ror	w21,w21,#24
	sli	v9.4s,v0.4s,#12
	add	w13,w13,w17
	sli	v13.4s,v1.4s,#12
	add	w14,w14,w19
	sli	v17.4s,v2.4s,#12
	add	w15,w15,w20
	sli	v21.4s,v3.4s,#12
	add	w16,w16,w21
	sli	v25.4s,v4.4s,#12
	eor	w9,w9,w13
	sli	v29.4s,v5.4s,#12
	eor	w10,w10,w14
	add	v8.4s,v8.4s,v9.4s
	eor	w11,w11,w15
	add	v12.4s,v12.4s,v13.4s
	eor	w12,w12,w16
	add	v16.4s,v16.4s,v17.4s
	ror	w9,w9,#25
	add	v20.4s,v20.4s,v21.4s
	ror	w10,w10,#25
	add	v24.4s,v24.4s,v25.4s
	ror	w11,w11,#25
	add	v28.4s,v28.4s,v29.4s
	ror	w12,w12,#25
	eor	v11.16b,v11.16b,v8.16b
	add	w5,w5,w10
	eor	v15.16b,v15.16b,v12.16b
	add	w6,w6,w11
	eor	v19.16b,v19.16b,v16.16b
	add	w7,w7,w12
	eor	v23.16b,v23.16b,v20.16b
	add	w8,w8,w9
	eor	v27.16b,v27.16b,v24.16b
	eor	w21,w21,w5
	eor	v31.16b,v31.16b,v28.16b
	eor	w17,w17,w6
	tbl	v11.16b,{v11.16b},v6.16b
	eor	w19,w19,w7
	tbl	v15.16b,{v15.16b},v6.16b
	eor	w20,w20,w8
	tbl	v19.16b,{v19.16b},v6.16b
	ror	w21,w21,#16
	tbl	v23.16b,{v23.16b},v6.16b
	ror	w17,w17,#16
	tbl	v27.16b,{v27.16b},v6.16b
	ror	w19,w19,#16
	tbl	v31.16b,{v31.16b},v6.16b
	ror	w20,w20,#16
	add	v10.4s,v10.4s,v11.4s
	add	w15,w15,w21
	add	v14.4s,v14.4s,v15.4s
	add	w16,w16,w17
	add	v18.4s,v18.4s,v19.4s
	add	w13,w13,w19
	add	v22.4s,v22.4s,v23.4s
	add	w14,w14,w20
	add	v26.4s,v26.4s,v27.4s
	eor	w10,w10,w15
	add	v30.4s,v30.4s,v31.4s
	eor	w11,w11,w16
	eor	v0.16b,v9.16b,v10.16b
	eor	w12,w12,w13
	eor	v1.16b,v13.16b,v14.16b
	eor	w9,w9,w14
	eor	v2.16b,v17.16b,v18.16b
	ror	w10,w10,#20
	eor	v3.16b,v21.16b,v22.16b
	ror	w11,w11,#20
	eor	v4.16b,v25.16b,v26.16b
	ror	w12,w12,#20
	eor	v5.16b,v29.16b,v30.16b
	ror	w9,w9,#20
	ushr	v9.4s,v0.4s,#25
	add	w5,w5,w10
	ushr	v13.4s,v1.4s,#25
	add	w6,w6,w11
	ushr	v17.4s,v2.4s,#25
	add	w7,w7,w12
	ushr	v21.4s,v3.4s,#25
	add	w8,w8,w9
	ushr	v25.4s,v4.4s,#25
	eor	w21,w21,w5
	ushr	v29.4s,v5.4s,#25
	eor	w17,w17,w6
	sli	v9.4s,v0.4s,#7
	eor	w19,w19,w7
	sli	v13.4s,v1.4s,#7
	eor	w20,w20,w8
	sli	v17.4s,v2.4s,#7
	ror	w21,w21,#24
	sli	v21.4s,v3.4s,#7
	ror	w17,w17,#24
	sli	v25.4s,v4.4s,#7
	ror	w19,w19,#24
	sli	v29.4s,v5.4s,#7
	ror	w20,w20,#24
	ext	v10.16b,v10.16b,v10.16b,#8
	add	w15,w15,w21
	ext	v14.16b,v14.16b,v14.16b,#8
	add	w16,w16,w17
	ext	v18.16b,v18.16b,v18.16b,#8
	add	w13,w13,w19
	ext	v22.16b,v22.16b,v22.16b,#8
	add	w14,w14,w20
	ext	v26.16b,v26.16b,v26.16b,#8
	eor	w10,w10,w15
	ext	v30.16b,v30.16b,v30.16b,#8
	eor	w11,w11,w16
	ext	v11.16b,v11.16b,v11.16b,#12
	eor	w12,w12,w13
	ext	v15.16b,v15.16b,v15.16b,#12
	eor	w9,w9,w14
	ext	v19.16b,v19.16b,v19.16b,#12
	ror	w10,w10,#25
	ext	v23.16b,v23.16b,v23.16b,#12
	ror	w11,w11,#25
	ext	v27.16b,v27.16b,v27.16b,#12
	ror	w12,w12,#25
	ext	v31.16b,v31.16b,v31.16b,#12
	ror	w9,w9,#25
	ext	v9.16b,v9.16b,v9.16b,#4
	ext	v13.16b,v13.16b,v13.16b,#4
	ext	v17.16b,v17.16b,v17.16b,#4
	ext	v21.16b,v21.16b,v21.16b,#4
	ext	v25.16b,v25.16b,v25.16b,#4
	ext	v29.16b,v29.16b,v29.16b,#4
	add	v8.4s,v8.4s,v9.4s
	add	w5,w5,w9
	add	v12.4s,v12.4s,v13.4s
	add	w6,w6,w10
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w11
	add	v20.4s,v20.4s,v21.4s
	add	w8,w8,w12
	add	v24.4s,v24.4s,v25.4s
	eor	w17,w17,w5
	add	v28.4s,v28.4s,v29.4s
	eor	w19,w19,w6
	eor	v11.16b,v11.16b,v8.16b
	eor	w20,w20,w7
	eor	v15.16b,v15.16b,v12.16b
	eor	w21,w21,w8
	eor	v19.16b,v19.16b,v16.16b
	ror	w17,w17,#16
	eor	v23.16b,v23.16b,v20.16b
	ror	w19,w19,#16
	eor	v27.16b,v27.16b,v24.16b
	ror	w20,w20,#16
	eor	v31.16b,v31.16b,v28.16b
	ror	w21,w21,#16
	rev32	v11.8h,v11.8h
	add	w13,w13,w17
	rev32	v15.8h,v15.8h
	add	w14,w14,w19
	rev32	v19.8h,v19.8h
	add	w15,w15,w20
	rev32	v23.8h,v23.8h
	add	w16,w16,w21
	rev32	v27.8h,v27.8h
	eor	w9,w9,w13
	rev32	v31.8h,v31.8h
	eor	w10,w10,w14
	add	v10.4s,v10.4s,v11.4s
	eor	w11,w11,w15
	add	v14.4s,v14.4s,v15.4s
	eor	w12,w12,w16
	add	v18.4s,v18.4s,v19.4s
	ror	w9,w9,#20
	add	v22.4s,v22.4s,v23.4s
	ror	w10,w10,#20
	add	v26.4s,v26.4s,v27.4s
	ror	w11,w11,#20
	add	v30.4s,v30.4s,v31.4s
	ror	w12,w12,#20
	eor	v0.16b,v9.16b,v10.16b
	add	w5,w5,w9
	eor	v1.16b,v13.16b,v14.16b
	add	w6,w6,w10
	eor	v2.16b,v17.16b,v18.16b
	add	w7,w7,w11
	eor	v3.16b,v21.16b,v22.16b
	add	w8,w8,w12
	eor	v4.16b,v25.16b,v26.16b
	eor	w17,w17,w5
	eor	v5.16b,v29.16b,v30.16b
	eor	w19,w19,w6
	ushr	v9.4s,v0.4s,#20
	eor	w20,w20,w7
	ushr	v13.4s,v1.4s,#20
	eor	w21,w21,w8
	ushr	v17.4s,v2.4s,#20
	ror	w17,w17,#24
	ushr	v21.4s,v3.4s,#20
	ror	w19,w19,#24
	ushr	v25.4s,v4.4s,#20
	ror	w20,w20,#24
	ushr	v29.4s,v5.4s,#20
	ror	w21,w21,#24
	sli	v9.4s,v0.4s,#12
	add	w13,w13,w17
	sli	v13.4s,v1.4s,#12
	add	w14,w14,w19
	sli	v17.4s,v2.4s,#12
	add	w15,w15,w20
	sli	v21.4s,v3.4s,#12
	add	w16,w16,w21
	sli	v25.4s,v4.4s,#12
	eor	w9,w9,w13
	sli	v29.4s,v5.4s,#12
	eor	w10,w10,w14
	add	v8.4s,v8.4s,v9.4s
	eor	w11,w11,w15
	add	v12.4s,v12.4s,v13.4s
	eor	w12,w12,w16
	add	v16.4s,v16.4s,v17.4s
	ror	w9,w9,#25
	add	v20.4s,v20.4s,v21.4s
	ror	w10,w10,#25
	add	v24.4s,v24.4s,v25.4s
	ror	w11,w11,#25
	add	v28.4s,v28.4s,v29.4s
	ror	w12,w12,#25
	eor	v11.16b,v11.16b,v8.16b
	add	w5,w5,w10
	eor	v15.16b,v15.16b,v12.16b
	add	w6,w6,w11
	eor	v19.16b,v19.16b,v16.16b
	add	w7,w7,w12
	eor	v23.16b,v23.16b,v20.16b
	add	w8,w8,w9
	eor	v27.16b,v27.16b,v24.16b
	eor	w21,w21,w5
	eor	v31.16b,v31.16b,v28.16b
	eor	w17,w17,w6
	tbl	v11.16b,{v11.16b},v6.16b
	eor	w19,w19,w7
	tbl	v15.16b,{v15.16b},v6.16b
	eor	w20,w20,w8
	tbl	v19.16b,{v19.16b},v6.16b
	ror	w21,w21,#16
	tbl	v23.16b,{v23.16b},v6.16b
	ror	w17,w17,#16
	tbl	v27.16b,{v27.16b},v6.16b
	ror	w19,w19,#16
	tbl	v31.16b,{v31.16b},v6.16b
	ror	w20,w20,#16
	add	v10.4s,v10.4s,v11.4s
	add	w15,w15,w21
	add	v14.4s,v14.4s,v15.4s
	add	w16,w16,w17
	add	v18.4s,v18.4s,v19.4s
	add	w13,w13,w19
	add	v22.4s,v22.4s,v23.4s
	add	w14,w14,w20
	add	v26.4s,v26.4s,v27.4s
	eor	w10,w10,w15
	add	v30.4s,v30.4s,v31.4s
	eor	w11,w11,w16
	eor	v0.16b,v9.16b,v10.16b
	eor	w12,w12,w13
	eor	v1.16b,v13.16b,v14.16b
	eor	w9,w9,w14
	eor	v2.16b,v17.16b,v18.16b
	ror	w10,w10,#20
	eor	v3.16b,v21.16b,v22.16b
	ror	w11,w11,#20
	eor	v4.16b,v25.16b,v26.16b
	ror	w12,w12,#20
	eor	v5.16b,v29.16b,v30.16b
	ror	w9,w9,#20
	ushr	v9.4s,v0.4s,#25
	add	w5,w5,w10
	ushr	v13.4s,v1.4s,#25
	add	w6,w6,w11
	ushr	v17.4s,v2.4s,#25
	add	w7,w7,w12
	ushr	v21.4s,v3.4s,#25
	add	w8,w8,w9
	ushr	v25.4s,v4.4s,#25
	eor	w21,w21,w5
	ushr	v29.4s,v5.4s,#25
	eor	w17,w17,w6
	sli	v9.4s,v0.4s,#7
	eor	w19,w19,w7
	sli	v13.4s,v1.4s,#7
	eor	w20,w20,w8
	sli	v17.4s,v2.4s,#7
	ror	w21,w21,#24
	sli	v21.4s,v3.4s,#7
	ror	w17,w17,#24
	sli	v25.4s,v4.4s,#7
	ror	w19,w19,#24
	sli	v29.4s,v5.4s,#7
	ror	w20,w20,#24
	ext	v10.16b,v10.16b,v10.16b,#8
	add	w15,w15,w21
	ext	v14.16b,v14.16b,v14.16b,#8
	add	w16,w16,w17
	ext	v18.16b,v18.16b,v18.16b,#8
	add	w13,w13,w19
	ext	v22.16b,v22.16b,v22.16b,#8
	add	w14,w14,w20
	ext	v26.16b,v26.16b,v26.16b,#8
	eor	w10,w10,w15
	ext	v30.16b,v30.16b,v30.16b,#8
	eor	w11,w11,w16
	ext	v11.16b,v11.16b,v11.16b,#4
	eor	w12,w12,w13
	ext	v15.16b,v15.16b,v15.16b,#4
	eor	w9,w9,w14
	ext	v19.16b,v19.16b,v19.16b,#4
	ror	w10,w10,#25
	ext	v23.16b,v23.16b,v23.16b,#4
	ror	w11,w11,#25
	ext	v27.16b,v27.16b,v27.16b,#4
	ror	w12,w12,#25
	ext	v31.16b,v31.16b,v31.16b,#4
	ror	w9,w9,#25
	ext	v9.16b,v9.16b,v9.16b,#12
	ext	v13.16b,v13.16b,v13.16b,#12
	ext	v17.16b,v17.16b,v17.16b,#12
	ext	v21.16b,v21.16b,v21.16b,#12
	ext	v25.16b,v25.16b,v25.16b,#12
	ext	v29.16b,v29.16b,v29.16b,#12
	cbnz	x4,.Loop_upper_neon

	// The scalar block worked on in .Loop_upper_neon is finished here:
	// add the input state (packed two 32-bit words per register in
	// x22-x28,x30) to the working words, XOR with 64 bytes of input,
	// store 64 bytes of output, then re-initialise the working words
	// for the next scalar block.
	add	w5,w5,w22		// accumulate key block
	add	x6,x6,x22,lsr#32	// 64-bit add of the high word; any carry
					// into bit 32 is shifted out by the pack
	add	w7,w7,w23
	add	x8,x8,x23,lsr#32
	add	w9,w9,w24
	add	x10,x10,x24,lsr#32
	add	w11,w11,w25
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	x21,x21,x30,lsr#32

	// Merge each even/odd pair of 32-bit words into one 64-bit register
	// (even word in the low half), interleaved with the input loads.
	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64		// advance input pointer past this block
#ifdef	__AARCH64EB__
	// Big-endian build: byte-reverse every packed doubleword before it
	// is combined with the loaded input.
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6		// output = packed state ^ input
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	// The stores are interleaved with reloading w5-w17,w19-w21 from the
	// packed input state; x28 is bumped first, so the reloaded w17
	// already carries the next block counter.
	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#1			// increment counter
	mov	w5,w22			// unpack key block
	lsr	x6,x22,#32
	stp	x9,x11,[x0,#16]
	mov	w7,w23
	lsr	x8,x23,#32
	stp	x13,x15,[x0,#32]
	mov	w9,w24
	lsr	x10,x24,#32
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64		// advance output pointer past this block
	mov	w11,w25
	lsr	x12,x25,#32
	mov	w13,w26
	lsr	x14,x26,#32
	mov	w15,w27
	lsr	x16,x27,#32
	mov	w17,w28
	lsr	x19,x28,#32
	mov	w20,w30
	lsr	x21,x30,#32

	// .Loop_lower_neon runs 5 times (x4 is the iteration counter).
	// Every iteration interleaves, instruction by instruction:
	//   * NEON: one double round (column half, then diagonal half) on
	//     six blocks held in v8-v11, v12-v15, ..., v28-v31; v0-v5 are
	//     per-block temporaries.
	//   * scalar: two double rounds on the block in w5-w17,w19-w21.
	// Rotations: scalar uses ror #16/#20/#24/#25. NEON uses rev32 on
	// halfwords (rotate by 16), ushr #20 + sli #12 (rotate left 12),
	// tbl through v6 (presumably the .Lrot24 byte permutation, loaded
	// outside this view - confirm), and ushr #25 + sli #7 (rotate
	// left 7).
	// No instruction in the loop writes the condition flags.
	mov	x4,#5
.Loop_lower_neon:
	sub	x4,x4,#1
	// NEON column half-round, interleaved with scalar double round 1.
	add	v8.4s,v8.4s,v9.4s
	add	w5,w5,w9
	add	v12.4s,v12.4s,v13.4s
	add	w6,w6,w10
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w11
	add	v20.4s,v20.4s,v21.4s
	add	w8,w8,w12
	add	v24.4s,v24.4s,v25.4s
	eor	w17,w17,w5
	add	v28.4s,v28.4s,v29.4s
	eor	w19,w19,w6
	eor	v11.16b,v11.16b,v8.16b
	eor	w20,w20,w7
	eor	v15.16b,v15.16b,v12.16b
	eor	w21,w21,w8
	eor	v19.16b,v19.16b,v16.16b
	ror	w17,w17,#16
	eor	v23.16b,v23.16b,v20.16b
	ror	w19,w19,#16
	eor	v27.16b,v27.16b,v24.16b
	ror	w20,w20,#16
	eor	v31.16b,v31.16b,v28.16b
	ror	w21,w21,#16
	rev32	v11.8h,v11.8h
	add	w13,w13,w17
	rev32	v15.8h,v15.8h
	add	w14,w14,w19
	rev32	v19.8h,v19.8h
	add	w15,w15,w20
	rev32	v23.8h,v23.8h
	add	w16,w16,w21
	rev32	v27.8h,v27.8h
	eor	w9,w9,w13
	rev32	v31.8h,v31.8h
	eor	w10,w10,w14
	add	v10.4s,v10.4s,v11.4s
	eor	w11,w11,w15
	add	v14.4s,v14.4s,v15.4s
	eor	w12,w12,w16
	add	v18.4s,v18.4s,v19.4s
	ror	w9,w9,#20
	add	v22.4s,v22.4s,v23.4s
	ror	w10,w10,#20
	add	v26.4s,v26.4s,v27.4s
	ror	w11,w11,#20
	add	v30.4s,v30.4s,v31.4s
	ror	w12,w12,#20
	eor	v0.16b,v9.16b,v10.16b
	add	w5,w5,w9
	eor	v1.16b,v13.16b,v14.16b
	add	w6,w6,w10
	eor	v2.16b,v17.16b,v18.16b
	add	w7,w7,w11
	eor	v3.16b,v21.16b,v22.16b
	add	w8,w8,w12
	eor	v4.16b,v25.16b,v26.16b
	eor	w17,w17,w5
	eor	v5.16b,v29.16b,v30.16b
	eor	w19,w19,w6
	ushr	v9.4s,v0.4s,#20
	eor	w20,w20,w7
	ushr	v13.4s,v1.4s,#20
	eor	w21,w21,w8
	ushr	v17.4s,v2.4s,#20
	ror	w17,w17,#24
	ushr	v21.4s,v3.4s,#20
	ror	w19,w19,#24
	ushr	v25.4s,v4.4s,#20
	ror	w20,w20,#24
	ushr	v29.4s,v5.4s,#20
	ror	w21,w21,#24
	sli	v9.4s,v0.4s,#12
	add	w13,w13,w17
	sli	v13.4s,v1.4s,#12
	add	w14,w14,w19
	sli	v17.4s,v2.4s,#12
	add	w15,w15,w20
	sli	v21.4s,v3.4s,#12
	add	w16,w16,w21
	sli	v25.4s,v4.4s,#12
	eor	w9,w9,w13
	sli	v29.4s,v5.4s,#12
	eor	w10,w10,w14
	add	v8.4s,v8.4s,v9.4s
	eor	w11,w11,w15
	add	v12.4s,v12.4s,v13.4s
	eor	w12,w12,w16
	add	v16.4s,v16.4s,v17.4s
	ror	w9,w9,#25
	add	v20.4s,v20.4s,v21.4s
	ror	w10,w10,#25
	add	v24.4s,v24.4s,v25.4s
	ror	w11,w11,#25
	add	v28.4s,v28.4s,v29.4s
	ror	w12,w12,#25
	eor	v11.16b,v11.16b,v8.16b
	add	w5,w5,w10
	eor	v15.16b,v15.16b,v12.16b
	add	w6,w6,w11
	eor	v19.16b,v19.16b,v16.16b
	add	w7,w7,w12
	eor	v23.16b,v23.16b,v20.16b
	add	w8,w8,w9
	eor	v27.16b,v27.16b,v24.16b
	eor	w21,w21,w5
	eor	v31.16b,v31.16b,v28.16b
	eor	w17,w17,w6
	tbl	v11.16b,{v11.16b},v6.16b
	eor	w19,w19,w7
	tbl	v15.16b,{v15.16b},v6.16b
	eor	w20,w20,w8
	tbl	v19.16b,{v19.16b},v6.16b
	ror	w21,w21,#16
	tbl	v23.16b,{v23.16b},v6.16b
	ror	w17,w17,#16
	tbl	v27.16b,{v27.16b},v6.16b
	ror	w19,w19,#16
	tbl	v31.16b,{v31.16b},v6.16b
	ror	w20,w20,#16
	add	v10.4s,v10.4s,v11.4s
	add	w15,w15,w21
	add	v14.4s,v14.4s,v15.4s
	add	w16,w16,w17
	add	v18.4s,v18.4s,v19.4s
	add	w13,w13,w19
	add	v22.4s,v22.4s,v23.4s
	add	w14,w14,w20
	add	v26.4s,v26.4s,v27.4s
	eor	w10,w10,w15
	add	v30.4s,v30.4s,v31.4s
	eor	w11,w11,w16
	eor	v0.16b,v9.16b,v10.16b
	eor	w12,w12,w13
	eor	v1.16b,v13.16b,v14.16b
	eor	w9,w9,w14
	eor	v2.16b,v17.16b,v18.16b
	ror	w10,w10,#20
	eor	v3.16b,v21.16b,v22.16b
	ror	w11,w11,#20
	eor	v4.16b,v25.16b,v26.16b
	ror	w12,w12,#20
	eor	v5.16b,v29.16b,v30.16b
	ror	w9,w9,#20
	ushr	v9.4s,v0.4s,#25
	add	w5,w5,w10
	ushr	v13.4s,v1.4s,#25
	add	w6,w6,w11
	ushr	v17.4s,v2.4s,#25
	add	w7,w7,w12
	ushr	v21.4s,v3.4s,#25
	add	w8,w8,w9
	ushr	v25.4s,v4.4s,#25
	eor	w21,w21,w5
	ushr	v29.4s,v5.4s,#25
	eor	w17,w17,w6
	sli	v9.4s,v0.4s,#7
	eor	w19,w19,w7
	sli	v13.4s,v1.4s,#7
	eor	w20,w20,w8
	sli	v17.4s,v2.4s,#7
	ror	w21,w21,#24
	sli	v21.4s,v3.4s,#7
	ror	w17,w17,#24
	sli	v25.4s,v4.4s,#7
	ror	w19,w19,#24
	sli	v29.4s,v5.4s,#7
	ror	w20,w20,#24
	// Rotate the 32-bit lanes of rows 1-3 of every NEON block (by 1, 2
	// and 3 lanes via ext #4/#8/#12) so that the next half-round, using
	// the same column-wise instructions, works on the diagonals.
	ext	v10.16b,v10.16b,v10.16b,#8
	add	w15,w15,w21
	ext	v14.16b,v14.16b,v14.16b,#8
	add	w16,w16,w17
	ext	v18.16b,v18.16b,v18.16b,#8
	add	w13,w13,w19
	ext	v22.16b,v22.16b,v22.16b,#8
	add	w14,w14,w20
	ext	v26.16b,v26.16b,v26.16b,#8
	eor	w10,w10,w15
	ext	v30.16b,v30.16b,v30.16b,#8
	eor	w11,w11,w16
	ext	v11.16b,v11.16b,v11.16b,#12
	eor	w12,w12,w13
	ext	v15.16b,v15.16b,v15.16b,#12
	eor	w9,w9,w14
	ext	v19.16b,v19.16b,v19.16b,#12
	ror	w10,w10,#25
	ext	v23.16b,v23.16b,v23.16b,#12
	ror	w11,w11,#25
	ext	v27.16b,v27.16b,v27.16b,#12
	ror	w12,w12,#25
	ext	v31.16b,v31.16b,v31.16b,#12
	ror	w9,w9,#25
	ext	v9.16b,v9.16b,v9.16b,#4
	ext	v13.16b,v13.16b,v13.16b,#4
	ext	v17.16b,v17.16b,v17.16b,#4
	ext	v21.16b,v21.16b,v21.16b,#4
	ext	v25.16b,v25.16b,v25.16b,#4
	ext	v29.16b,v29.16b,v29.16b,#4
	// NEON diagonal half-round, interleaved with scalar double round 2.
	add	v8.4s,v8.4s,v9.4s
	add	w5,w5,w9
	add	v12.4s,v12.4s,v13.4s
	add	w6,w6,w10
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w11
	add	v20.4s,v20.4s,v21.4s
	add	w8,w8,w12
	add	v24.4s,v24.4s,v25.4s
	eor	w17,w17,w5
	add	v28.4s,v28.4s,v29.4s
	eor	w19,w19,w6
	eor	v11.16b,v11.16b,v8.16b
	eor	w20,w20,w7
	eor	v15.16b,v15.16b,v12.16b
	eor	w21,w21,w8
	eor	v19.16b,v19.16b,v16.16b
	ror	w17,w17,#16
	eor	v23.16b,v23.16b,v20.16b
	ror	w19,w19,#16
	eor	v27.16b,v27.16b,v24.16b
	ror	w20,w20,#16
	eor	v31.16b,v31.16b,v28.16b
	ror	w21,w21,#16
	rev32	v11.8h,v11.8h
	add	w13,w13,w17
	rev32	v15.8h,v15.8h
	add	w14,w14,w19
	rev32	v19.8h,v19.8h
	add	w15,w15,w20
	rev32	v23.8h,v23.8h
	add	w16,w16,w21
	rev32	v27.8h,v27.8h
	eor	w9,w9,w13
	rev32	v31.8h,v31.8h
	eor	w10,w10,w14
	add	v10.4s,v10.4s,v11.4s
	eor	w11,w11,w15
	add	v14.4s,v14.4s,v15.4s
	eor	w12,w12,w16
	add	v18.4s,v18.4s,v19.4s
	ror	w9,w9,#20
	add	v22.4s,v22.4s,v23.4s
	ror	w10,w10,#20
	add	v26.4s,v26.4s,v27.4s
	ror	w11,w11,#20
	add	v30.4s,v30.4s,v31.4s
	ror	w12,w12,#20
	eor	v0.16b,v9.16b,v10.16b
	add	w5,w5,w9
	eor	v1.16b,v13.16b,v14.16b
	add	w6,w6,w10
	eor	v2.16b,v17.16b,v18.16b
	add	w7,w7,w11
	eor	v3.16b,v21.16b,v22.16b
	add	w8,w8,w12
	eor	v4.16b,v25.16b,v26.16b
	eor	w17,w17,w5
	eor	v5.16b,v29.16b,v30.16b
	eor	w19,w19,w6
	ushr	v9.4s,v0.4s,#20
	eor	w20,w20,w7
	ushr	v13.4s,v1.4s,#20
	eor	w21,w21,w8
	ushr	v17.4s,v2.4s,#20
	ror	w17,w17,#24
	ushr	v21.4s,v3.4s,#20
	ror	w19,w19,#24
	ushr	v25.4s,v4.4s,#20
	ror	w20,w20,#24
	ushr	v29.4s,v5.4s,#20
	ror	w21,w21,#24
	sli	v9.4s,v0.4s,#12
	add	w13,w13,w17
	sli	v13.4s,v1.4s,#12
	add	w14,w14,w19
	sli	v17.4s,v2.4s,#12
	add	w15,w15,w20
	sli	v21.4s,v3.4s,#12
	add	w16,w16,w21
	sli	v25.4s,v4.4s,#12
	eor	w9,w9,w13
	sli	v29.4s,v5.4s,#12
	eor	w10,w10,w14
	add	v8.4s,v8.4s,v9.4s
	eor	w11,w11,w15
	add	v12.4s,v12.4s,v13.4s
	eor	w12,w12,w16
	add	v16.4s,v16.4s,v17.4s
	ror	w9,w9,#25
	add	v20.4s,v20.4s,v21.4s
	ror	w10,w10,#25
	add	v24.4s,v24.4s,v25.4s
	ror	w11,w11,#25
	add	v28.4s,v28.4s,v29.4s
	ror	w12,w12,#25
	eor	v11.16b,v11.16b,v8.16b
	add	w5,w5,w10
	eor	v15.16b,v15.16b,v12.16b
	add	w6,w6,w11
	eor	v19.16b,v19.16b,v16.16b
	add	w7,w7,w12
	eor	v23.16b,v23.16b,v20.16b
	add	w8,w8,w9
	eor	v27.16b,v27.16b,v24.16b
	eor	w21,w21,w5
	eor	v31.16b,v31.16b,v28.16b
	eor	w17,w17,w6
	tbl	v11.16b,{v11.16b},v6.16b
	eor	w19,w19,w7
	tbl	v15.16b,{v15.16b},v6.16b
	eor	w20,w20,w8
	tbl	v19.16b,{v19.16b},v6.16b
	ror	w21,w21,#16
	tbl	v23.16b,{v23.16b},v6.16b
	ror	w17,w17,#16
	tbl	v27.16b,{v27.16b},v6.16b
	ror	w19,w19,#16
	tbl	v31.16b,{v31.16b},v6.16b
	ror	w20,w20,#16
	add	v10.4s,v10.4s,v11.4s
	add	w15,w15,w21
	add	v14.4s,v14.4s,v15.4s
	add	w16,w16,w17
	add	v18.4s,v18.4s,v19.4s
	add	w13,w13,w19
	add	v22.4s,v22.4s,v23.4s
	add	w14,w14,w20
	add	v26.4s,v26.4s,v27.4s
	eor	w10,w10,w15
	add	v30.4s,v30.4s,v31.4s
	eor	w11,w11,w16
	eor	v0.16b,v9.16b,v10.16b
	eor	w12,w12,w13
	eor	v1.16b,v13.16b,v14.16b
	eor	w9,w9,w14
	eor	v2.16b,v17.16b,v18.16b
	ror	w10,w10,#20
	eor	v3.16b,v21.16b,v22.16b
	ror	w11,w11,#20
	eor	v4.16b,v25.16b,v26.16b
	ror	w12,w12,#20
	eor	v5.16b,v29.16b,v30.16b
	ror	w9,w9,#20
	ushr	v9.4s,v0.4s,#25
	add	w5,w5,w10
	ushr	v13.4s,v1.4s,#25
	add	w6,w6,w11
	ushr	v17.4s,v2.4s,#25
	add	w7,w7,w12
	ushr	v21.4s,v3.4s,#25
	add	w8,w8,w9
	ushr	v25.4s,v4.4s,#25
	eor	w21,w21,w5
	ushr	v29.4s,v5.4s,#25
	eor	w17,w17,w6
	sli	v9.4s,v0.4s,#7
	eor	w19,w19,w7
	sli	v13.4s,v1.4s,#7
	eor	w20,w20,w8
	sli	v17.4s,v2.4s,#7
	ror	w21,w21,#24
	sli	v21.4s,v3.4s,#7
	ror	w17,w17,#24
	sli	v25.4s,v4.4s,#7
	ror	w19,w19,#24
	sli	v29.4s,v5.4s,#7
	ror	w20,w20,#24
	// Undo the lane rotation (ext #12/#4 swapped relative to the first
	// half) so the NEON rows are column-aligned again.
	ext	v10.16b,v10.16b,v10.16b,#8
	add	w15,w15,w21
	ext	v14.16b,v14.16b,v14.16b,#8
	add	w16,w16,w17
	ext	v18.16b,v18.16b,v18.16b,#8
	add	w13,w13,w19
	ext	v22.16b,v22.16b,v22.16b,#8
	add	w14,w14,w20
	ext	v26.16b,v26.16b,v26.16b,#8
	eor	w10,w10,w15
	ext	v30.16b,v30.16b,v30.16b,#8
	eor	w11,w11,w16
	ext	v11.16b,v11.16b,v11.16b,#4
	eor	w12,w12,w13
	ext	v15.16b,v15.16b,v15.16b,#4
	eor	w9,w9,w14
	ext	v19.16b,v19.16b,v19.16b,#4
	ror	w10,w10,#25
	ext	v23.16b,v23.16b,v23.16b,#4
	ror	w11,w11,#25
	ext	v27.16b,v27.16b,v27.16b,#4
	ror	w12,w12,#25
	ext	v31.16b,v31.16b,v31.16b,#4
	ror	w9,w9,#25
	ext	v9.16b,v9.16b,v9.16b,#12
	ext	v13.16b,v13.16b,v13.16b,#12
	ext	v17.16b,v17.16b,v17.16b,#12
	ext	v21.16b,v21.16b,v21.16b,#12
	ext	v25.16b,v25.16b,v25.16b,#12
	ext	v29.16b,v29.16b,v29.16b,#12
	cbnz	x4,.Loop_lower_neon	// repeat until the 5 iterations are done

	// All rounds are done.  Add the input state back into the second
	// scalar block (w5-w21, from x22-x28,x30) and into the six NEON
	// blocks (v8-v31).  The vector copy of the input state is reloaded
	// into v0-v6 from the off-load area at [sp,#0..#111].  The scalar
	// pack step and the input loads for the scalar block are
	// interleaved with the vector adds.
	add	w5,w5,w22		// accumulate key block
	ldp	q0,q1,[sp,#0]
	add	x6,x6,x22,lsr#32
	ldp	q2,q3,[sp,#32]
	add	w7,w7,w23
	ldp	q4,q5,[sp,#64]
	add	x8,x8,x23,lsr#32
	ldr	q6,[sp,#96]
	add	v8.4s,v8.4s,v0.4s	// row 0 of every block += v0
	add	w9,w9,w24
	add	v12.4s,v12.4s,v0.4s
	add	x10,x10,x24,lsr#32
	add	v16.4s,v16.4s,v0.4s
	add	w11,w11,w25
	add	v20.4s,v20.4s,v0.4s
	add	x12,x12,x25,lsr#32
	add	v24.4s,v24.4s,v0.4s
	add	w13,w13,w26
	add	v28.4s,v28.4s,v0.4s
	add	x14,x14,x26,lsr#32
	add	v10.4s,v10.4s,v2.4s	// row 2 of every block += v2
	add	w15,w15,w27
	add	v14.4s,v14.4s,v2.4s
	add	x16,x16,x27,lsr#32
	add	v18.4s,v18.4s,v2.4s
	add	w17,w17,w28
	add	v22.4s,v22.4s,v2.4s
	add	x19,x19,x28,lsr#32
	add	v26.4s,v26.4s,v2.4s
	add	w20,w20,w30
	add	v30.4s,v30.4s,v2.4s
	add	x21,x21,x30,lsr#32
	// Row 3 differs per block: blocks 0-3 add v3,v4,v5,v6 respectively;
	// blocks 4 and 5 reuse v3 and v4 with an extra v7 added on top.
	add	v27.4s,v27.4s,v7.4s			// +4
	add	x5,x5,x6,lsl#32	// pack
	add	v31.4s,v31.4s,v7.4s			// +4
	add	x7,x7,x8,lsl#32
	add	v11.4s,v11.4s,v3.4s
	ldp	x6,x8,[x1,#0]		// load input
	add	v15.4s,v15.4s,v4.4s
	add	x9,x9,x10,lsl#32
	add	v19.4s,v19.4s,v5.4s
	add	x11,x11,x12,lsl#32
	add	v23.4s,v23.4s,v6.4s
	ldp	x10,x12,[x1,#16]
	add	v27.4s,v27.4s,v3.4s
	add	x13,x13,x14,lsl#32
	add	v31.4s,v31.4s,v4.4s
	add	x15,x15,x16,lsl#32
	add	v9.4s,v9.4s,v1.4s	// row 1 of every block += v1
	ldp	x14,x16,[x1,#32]
	add	v13.4s,v13.4s,v1.4s
	add	x17,x17,x19,lsl#32
	add	v17.4s,v17.4s,v1.4s
	add	x20,x20,x21,lsl#32
	add	v21.4s,v21.4s,v1.4s
	ldp	x19,x21,[x1,#48]
	add	v25.4s,v25.4s,v1.4s
	add	x1,x1,#64		// input pointer past the scalar block
	add	v29.4s,v29.4s,v1.4s

	// XOR the second scalar block and the six NEON blocks with the
	// input and store them (7 x 64 bytes here; together with the scalar
	// block stored before .Loop_lower_neon one pass produces 512 bytes).
	// Vector registers are recycled as input buffers: v0-v3 first, then
	// each block's own registers once that block has been stored.
#ifdef	__AARCH64EB__
	// Big-endian build: byte-reverse the packed scalar doublewords.
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64	// input for v8-v11
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	v8.16b,v8.16b,v0.16b
	eor	x15,x15,x16
	eor	v9.16b,v9.16b,v1.16b
	eor	x17,x17,x19
	eor	v10.16b,v10.16b,v2.16b
	eor	x20,x20,x21
	eor	v11.16b,v11.16b,v3.16b
	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64	// input for v12-v15

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#7			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64
	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64

	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64	// input for v16-v19
	eor	v12.16b,v12.16b,v0.16b
	eor	v13.16b,v13.16b,v1.16b
	eor	v14.16b,v14.16b,v2.16b
	eor	v15.16b,v15.16b,v3.16b
	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64

	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64	// input for v20-v23
	eor	v16.16b,v16.16b,v8.16b
	ldp	q0,q1,[sp,#0]		// v0-v3 were used as input buffers:
	eor	v17.16b,v17.16b,v9.16b
	ldp	q2,q3,[sp,#32]		// reload them from the off-load area
	eor	v18.16b,v18.16b,v10.16b
	eor	v19.16b,v19.16b,v11.16b
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64

	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64	// input for v24-v27
	eor	v20.16b,v20.16b,v12.16b
	eor	v21.16b,v21.16b,v13.16b
	eor	v22.16b,v22.16b,v14.16b
	eor	v23.16b,v23.16b,v15.16b
	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64	// input for v28-v31
	eor	v24.16b,v24.16b,v16.16b
	eor	v25.16b,v25.16b,v17.16b
	eor	v26.16b,v26.16b,v18.16b
	eor	v27.16b,v27.16b,v19.16b
	st1	{v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64

	shl	v8.4s,v7.4s,#1			// 4 -> 8
	eor	v28.16b,v28.16b,v20.16b
	eor	v29.16b,v29.16b,v21.16b
	eor	v30.16b,v30.16b,v22.16b
	eor	v31.16b,v31.16b,v23.16b
	st1	{v28.16b,v29.16b,v30.16b,v31.16b},[x0],#64

	// Step the four row-3 input vectors by v8 for the next pass.
	add	v3.4s,v3.4s,v8.4s			// += 8
	add	v4.4s,v4.4s,v8.4s
	add	v5.4s,v5.4s,v8.4s
	add	v6.4s,v6.4s,v8.4s

	// No visible instruction of this pass writes NZCV, so this tests
	// flags set before the round loops (outside this view; presumably
	// the length update at the top of .Loop_outer_512_neon - confirm).
	b.hs	.Loop_outer_512_neon

	// Out of the 512-byte loop.  Add 512 back to x2; the Z flag from
	// this adds is consumed by the b.eq below (the instructions in
	// between are loads, stores and a vector shift, none of which
	// write the flags).
	adds	x2,x2,#512
	ushr	v7.4s,v7.4s,#1			// 4 -> 2

	ldp	d10,d11,[sp,#128+16]		// meet ABI requirements
	ldp	d12,d13,[sp,#128+32]
	ldp	d14,d15,[sp,#128+48]

	// Overwrite the first 96 bytes of the off-load area with copies
	// of v0.
	stp	q0,q0,[sp,#0]		// wipe off-load area
	stp	q0,q0,[sp,#32]
	stp	q0,q0,[sp,#64]

	b.eq	.Ldone_512_neon		// x2 == 0: nothing left to process

	// Some input remains.  Release the 128-byte off-load area (64 bytes
	// of this frame stay allocated, d8/d9 now at [sp,#0]) and hand over
	// to another loop: .Loop_outer_neon when at least 192 bytes remain,
	// otherwise the scalar .Loop_outer.
	sub	x3,x3,#16			// .Lone
	cmp	x2,#192
	add	sp,sp,#128
	sub	v3.4s,v3.4s,v7.4s		// -= 2
	ld1	{v8.4s,v9.4s},[x3]	// 32 bytes at x3 (.Lone, then what follows it)
	b.hs	.Loop_outer_neon

	// Fewer than 192 bytes left: restore d8/d9, zero v1-v6 and continue
	// in the scalar loop.  The remaining 64 bytes below the register
	// save area match the sub sp,sp,#64 done by ChaCha20_ctr32.
	ldp	d8,d9,[sp,#0]			// meet ABI requirements
	eor	v1.16b,v1.16b,v1.16b
	eor	v2.16b,v2.16b,v2.16b
	eor	v3.16b,v3.16b,v3.16b
	eor	v4.16b,v4.16b,v4.16b
	eor	v5.16b,v5.16b,v5.16b
	eor	v6.16b,v6.16b,v6.16b
	b	.Loop_outer

// Exit path taken when the whole input has been processed.  d10-d15 were
// already reloaded before the branch here; restore d8/d9 and x19-x28,
// release the 128+64 bytes of locals and the 96-byte register frame,
// then return.
.Ldone_512_neon:
	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
	ldp	x19,x20,[x29,#16]	// x19-x28 are reloaded relative to x29
	add	sp,sp,#128+64		// drop off-load area + d8-d15 save slots
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96	// pop frame record and release 96 bytes
.inst	0xd50323bf			// autiasp
	ret
.size	ChaCha20_512_neon,.-ChaCha20_512_neon