216 lines
		
	
	
		
			3.5 KiB
		
	
	
	
		
			ArmAsm
		
	
	
			
		
		
	
	
			216 lines
		
	
	
		
			3.5 KiB
		
	
	
	
		
			ArmAsm
		
	
	
| // +build !appengine
 | |
| // +build gc
 | |
| // +build !purego
 | |
| 
 | |
| #include "textflag.h"
 | |
| 
 | |
// Register allocation:
// AX	h
// SI	pointer to advance through b
// DX	n
// BX	loop end
// R8	v1, k1
// R9	v2
// R10	v3
// R11	v4
// R12	tmp
// R13	prime1v
// R14	prime2v
// DI	prime4v
 | |
| 
 | |
| // round reads from and advances the buffer pointer in SI.
 | |
| // It assumes that R13 has prime1v and R14 has prime2v.
 | |
| #define round(r) \
 | |
| 	MOVQ  (SI), R12 \
 | |
| 	ADDQ  $8, SI    \
 | |
| 	IMULQ R14, R12  \
 | |
| 	ADDQ  R12, r    \
 | |
| 	ROLQ  $31, r    \
 | |
| 	IMULQ R13, r
 | |
| 
 | |
| // mergeRound applies a merge round on the two registers acc and val.
 | |
| // It assumes that R13 has prime1v, R14 has prime2v, and DI has prime4v.
 | |
| #define mergeRound(acc, val) \
 | |
| 	IMULQ R14, val \
 | |
| 	ROLQ  $31, val \
 | |
| 	IMULQ R13, val \
 | |
| 	XORQ  val, acc \
 | |
| 	IMULQ R13, acc \
 | |
| 	ADDQ  DI, acc
 | |
| 
 | |
| // func Sum64(b []byte) uint64
 | |
| TEXT ·Sum64(SB), NOSPLIT, $0-32
 | |
| 	// Load fixed primes.
 | |
| 	MOVQ ·prime1v(SB), R13
 | |
| 	MOVQ ·prime2v(SB), R14
 | |
| 	MOVQ ·prime4v(SB), DI
 | |
| 
 | |
| 	// Load slice.
 | |
| 	MOVQ b_base+0(FP), SI
 | |
| 	MOVQ b_len+8(FP), DX
 | |
| 	LEAQ (SI)(DX*1), BX
 | |
| 
 | |
| 	// The first loop limit will be len(b)-32.
 | |
| 	SUBQ $32, BX
 | |
| 
 | |
| 	// Check whether we have at least one block.
 | |
| 	CMPQ DX, $32
 | |
| 	JLT  noBlocks
 | |
| 
 | |
| 	// Set up initial state (v1, v2, v3, v4).
 | |
| 	MOVQ R13, R8
 | |
| 	ADDQ R14, R8
 | |
| 	MOVQ R14, R9
 | |
| 	XORQ R10, R10
 | |
| 	XORQ R11, R11
 | |
| 	SUBQ R13, R11
 | |
| 
 | |
| 	// Loop until SI > BX.
 | |
| blockLoop:
 | |
| 	round(R8)
 | |
| 	round(R9)
 | |
| 	round(R10)
 | |
| 	round(R11)
 | |
| 
 | |
| 	CMPQ SI, BX
 | |
| 	JLE  blockLoop
 | |
| 
 | |
| 	MOVQ R8, AX
 | |
| 	ROLQ $1, AX
 | |
| 	MOVQ R9, R12
 | |
| 	ROLQ $7, R12
 | |
| 	ADDQ R12, AX
 | |
| 	MOVQ R10, R12
 | |
| 	ROLQ $12, R12
 | |
| 	ADDQ R12, AX
 | |
| 	MOVQ R11, R12
 | |
| 	ROLQ $18, R12
 | |
| 	ADDQ R12, AX
 | |
| 
 | |
| 	mergeRound(AX, R8)
 | |
| 	mergeRound(AX, R9)
 | |
| 	mergeRound(AX, R10)
 | |
| 	mergeRound(AX, R11)
 | |
| 
 | |
| 	JMP afterBlocks
 | |
| 
 | |
| noBlocks:
 | |
| 	MOVQ ·prime5v(SB), AX
 | |
| 
 | |
| afterBlocks:
 | |
| 	ADDQ DX, AX
 | |
| 
 | |
| 	// Right now BX has len(b)-32, and we want to loop until SI > len(b)-8.
 | |
| 	ADDQ $24, BX
 | |
| 
 | |
| 	CMPQ SI, BX
 | |
| 	JG   fourByte
 | |
| 
 | |
| wordLoop:
 | |
| 	// Calculate k1.
 | |
| 	MOVQ  (SI), R8
 | |
| 	ADDQ  $8, SI
 | |
| 	IMULQ R14, R8
 | |
| 	ROLQ  $31, R8
 | |
| 	IMULQ R13, R8
 | |
| 
 | |
| 	XORQ  R8, AX
 | |
| 	ROLQ  $27, AX
 | |
| 	IMULQ R13, AX
 | |
| 	ADDQ  DI, AX
 | |
| 
 | |
| 	CMPQ SI, BX
 | |
| 	JLE  wordLoop
 | |
| 
 | |
| fourByte:
 | |
| 	ADDQ $4, BX
 | |
| 	CMPQ SI, BX
 | |
| 	JG   singles
 | |
| 
 | |
| 	MOVL  (SI), R8
 | |
| 	ADDQ  $4, SI
 | |
| 	IMULQ R13, R8
 | |
| 	XORQ  R8, AX
 | |
| 
 | |
| 	ROLQ  $23, AX
 | |
| 	IMULQ R14, AX
 | |
| 	ADDQ  ·prime3v(SB), AX
 | |
| 
 | |
| singles:
 | |
| 	ADDQ $4, BX
 | |
| 	CMPQ SI, BX
 | |
| 	JGE  finalize
 | |
| 
 | |
| singlesLoop:
 | |
| 	MOVBQZX (SI), R12
 | |
| 	ADDQ    $1, SI
 | |
| 	IMULQ   ·prime5v(SB), R12
 | |
| 	XORQ    R12, AX
 | |
| 
 | |
| 	ROLQ  $11, AX
 | |
| 	IMULQ R13, AX
 | |
| 
 | |
| 	CMPQ SI, BX
 | |
| 	JL   singlesLoop
 | |
| 
 | |
| finalize:
 | |
| 	MOVQ  AX, R12
 | |
| 	SHRQ  $33, R12
 | |
| 	XORQ  R12, AX
 | |
| 	IMULQ R14, AX
 | |
| 	MOVQ  AX, R12
 | |
| 	SHRQ  $29, R12
 | |
| 	XORQ  R12, AX
 | |
| 	IMULQ ·prime3v(SB), AX
 | |
| 	MOVQ  AX, R12
 | |
| 	SHRQ  $32, R12
 | |
| 	XORQ  R12, AX
 | |
| 
 | |
| 	MOVQ AX, ret+24(FP)
 | |
| 	RET
 | |
| 
 | |
| // writeBlocks uses the same registers as above except that it uses AX to store
 | |
| // the d pointer.
 | |
| 
 | |
| // func writeBlocks(d *Digest, b []byte) int
 | |
| TEXT ·writeBlocks(SB), NOSPLIT, $0-40
 | |
| 	// Load fixed primes needed for round.
 | |
| 	MOVQ ·prime1v(SB), R13
 | |
| 	MOVQ ·prime2v(SB), R14
 | |
| 
 | |
| 	// Load slice.
 | |
| 	MOVQ b_base+8(FP), SI
 | |
| 	MOVQ b_len+16(FP), DX
 | |
| 	LEAQ (SI)(DX*1), BX
 | |
| 	SUBQ $32, BX
 | |
| 
 | |
| 	// Load vN from d.
 | |
| 	MOVQ d+0(FP), AX
 | |
| 	MOVQ 0(AX), R8   // v1
 | |
| 	MOVQ 8(AX), R9   // v2
 | |
| 	MOVQ 16(AX), R10 // v3
 | |
| 	MOVQ 24(AX), R11 // v4
 | |
| 
 | |
| 	// We don't need to check the loop condition here; this function is
 | |
| 	// always called with at least one block of data to process.
 | |
| blockLoop:
 | |
| 	round(R8)
 | |
| 	round(R9)
 | |
| 	round(R10)
 | |
| 	round(R11)
 | |
| 
 | |
| 	CMPQ SI, BX
 | |
| 	JLE  blockLoop
 | |
| 
 | |
| 	// Copy vN back to d.
 | |
| 	MOVQ R8, 0(AX)
 | |
| 	MOVQ R9, 8(AX)
 | |
| 	MOVQ R10, 16(AX)
 | |
| 	MOVQ R11, 24(AX)
 | |
| 
 | |
| 	// The number of bytes written is SI minus the old base pointer.
 | |
| 	SUBQ b_base+8(FP), SI
 | |
| 	MOVQ SI, ret+32(FP)
 | |
| 
 | |
| 	RET
 |