// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This is an optimized implementation of AES-GCM using AES-NI and CLMUL-NI
// The implementation uses some optimizations as described in:
// [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
//     Instruction and its Usage for Computing the GCM Mode rev. 2.02
// [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and
//     Hardware
//
// Overview of this file:
//   gcmAesFinish - folds the two lengths into the hash state and masks the tag
//   gcmAesInit   - derives the hash key and fills the 256-byte product table
//   gcmAesData   - hashes a byte slice into a fresh accumulator
//   gcmAesEnc    - counter-mode encryption combined with hashing of the output
//   gcmAesDec    - hashing of the input combined with counter-mode decryption
//
// Product table layout as written by gcmAesInit (16-byte entries):
//   entry 14 holds the byte-swapped, doubled hash key; entry 15 holds the XOR
//   of its two 64-bit halves (Karatsuba operand). Each lower pair (12/13,
//   10/11, ..., 0/1) holds the previous value multiplied once more by the
//   entry-14 key, together with its own half-XOR.

#include "textflag.h"

// B0-B7: up to eight data/counter blocks processed in parallel.
#define B0 X0
#define B1 X1
#define B2 X2
#define B3 X3
#define B4 X4
#define B5 X5
#define B6 X6
#define B7 X7

// ACC0 accumulates the low*low products (PCLMULQDQ $0x00), ACC1 the
// high*high products (PCLMULQDQ $0x11) and ACCM the Karatsuba middle term.
#define ACC0 X8
#define ACC1 X9
#define ACCM X10

// T0-T2: scratch registers. POLY and BSWAP hold the constants defined below.
#define T0 X11
#define T1 X12
#define T2 X13
#define POLY X14
#define BSWAP X15

// PSHUFB control mask that reverses the order of all 16 bytes.
DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607

// Constant used by the reduction steps (see reduceRound) and by the
// "H * 2" computation in gcmAesInit.
DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000

// andMask holds fifteen 16-byte masks; the mask at offset 16*(n-1) keeps the
// low n bytes of a register (n = 1..15). It is indexed as
// -16(base)(n*16) by the tail-handling code.
DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
DATA andMask<>+0x08(SB)/8, $0x0000000000000000
DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
DATA andMask<>+0x18(SB)/8, $0x0000000000000000
DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
DATA andMask<>+0x28(SB)/8, $0x0000000000000000
DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
DATA andMask<>+0x38(SB)/8, $0x0000000000000000
DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
DATA andMask<>+0x48(SB)/8, $0x0000000000000000
DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
DATA andMask<>+0x58(SB)/8, $0x0000000000000000
DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
DATA andMask<>+0x68(SB)/8, $0x0000000000000000
DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x78(SB)/8, $0x0000000000000000
DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff

GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16
GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
GLOBL andMask<>(SB), (NOPTR+RODATA), $240

// func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
//
// Reads the 16-byte state at T, XORs in a block built from pLen*8 (low
// quadword) and dLen*8 (high quadword), multiplies by product-table entry 14
// with reduction, byte-reverses the result, XORs it with tagMask and writes
// it back to T.
TEXT ·gcmAesFinish(SB),NOSPLIT,$0
#define pTbl DI
#define tMsk SI
#define tPtr DX
#define plen AX
#define dlen CX

	MOVQ productTable+0(FP), pTbl
	MOVQ tagMask+8(FP), tMsk
	MOVQ T+16(FP), tPtr
	MOVQ pLen+24(FP), plen
	MOVQ dLen+32(FP), dlen

	MOVOU (tPtr), ACC0
	MOVOU (tMsk), T2

	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	// Multiply both lengths by 8 (presumably bytes -> bits; confirm with caller).
	SHLQ $3, plen
	SHLQ $3, dlen

	// B0 = dlen:plen (plen in the low quadword, dlen in the high quadword).
	MOVQ plen, B0
	PINSRQ $1, dlen, B0

	PXOR ACC0, B0

	MOVOU (16*14)(pTbl), ACC0
	MOVOU (16*15)(pTbl), ACCM
	MOVOU ACC0, ACC1

	// Karatsuba: low*low, high*high, and (lo^hi)*(lo^hi).
	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, T0, ACCM

	// Recover the middle term and fold its halves into ACC1:ACC0.
	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	// Two reduction steps (same sequence as the reduceRound macro defined
	// further down in this file).
	MOVOU POLY, T0
	PCLMULQDQ $0x01, ACC0, T0
	PSHUFD $78, ACC0, ACC0
	PXOR T0, ACC0

	MOVOU POLY, T0
	PCLMULQDQ $0x01, ACC0, T0
	PSHUFD $78, ACC0, ACC0
	PXOR T0, ACC0

	PXOR ACC1, ACC0

	PSHUFB BSWAP, ACC0
	PXOR T2, ACC0
	MOVOU ACC0, (tPtr)

	RET

#undef pTbl
#undef tMsk
#undef tPtr
#undef plen
#undef dlen

// func gcmAesInit(productTable *[256]byte, ks []uint32)
//
// Runs the AES rounds over round key 0 (i.e. encrypts an all-zero block),
// byte-reverses and doubles the result, then fills productTable with that
// value, its successive products with itself, and the half-XOR of each
// (see the table layout description at the top of the file).
TEXT ·gcmAesInit(SB),NOSPLIT,$0
#define dst DI
#define KS SI
#define NR DX

	MOVQ productTable+0(FP), dst
	MOVQ ks_base+8(FP), KS
	MOVQ ks_len+16(FP), NR

	// NR = len(ks)/4 - 1. It is compared against 12 below to choose whether
	// round key 10, 12 or 14 is the final one.
	SHRQ $2, NR
	DECQ NR

	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	// Encrypt block 0, with the AES key to generate the hash key H
	MOVOU (16*0)(KS), B0
	MOVOU (16*1)(KS), T0
	AESENC T0, B0
	MOVOU (16*2)(KS), T0
	AESENC T0, B0
	MOVOU (16*3)(KS), T0
	AESENC T0, B0
	MOVOU (16*4)(KS), T0
	AESENC T0, B0
	MOVOU (16*5)(KS), T0
	AESENC T0, B0
	MOVOU (16*6)(KS), T0
	AESENC T0, B0
	MOVOU (16*7)(KS), T0
	AESENC T0, B0
	MOVOU (16*8)(KS), T0
	AESENC T0, B0
	MOVOU (16*9)(KS), T0
	AESENC T0, B0
	MOVOU (16*10)(KS), T0
	CMPQ NR, $12
	JB initEncLast
	AESENC T0, B0
	MOVOU (16*11)(KS), T0
	AESENC T0, B0
	MOVOU (16*12)(KS), T0
	// This JE still tests the flags of the CMPQ above; the instructions in
	// between are only MOVOU and AESENC.
	JE initEncLast
	AESENC T0, B0
	MOVOU (16*13)(KS), T0
	AESENC T0, B0
	MOVOU (16*14)(KS), T0
initEncLast:
	AESENCLAST T0, B0

	PSHUFB BSWAP, B0
	// H * 2
	// T0 = POLY if the top bit of B0 is set, else 0; T1 carries the bit
	// shifted out of each 32-bit lane into the next lane.
	PSHUFD $0xff, B0, T0
	MOVOU B0, T1
	PSRAL $31, T0
	PAND POLY, T0
	PSRLL $31, T1
	PSLLDQ $4, T1
	PSLLL $1, B0
	PXOR T0, B0
	PXOR T1, B0
	// Karatsuba pre-computations
	MOVOU B0, (16*14)(dst)
	PSHUFD $78, B0, B1
	PXOR B0, B1
	MOVOU B1, (16*15)(dst)

	MOVOU B0, B2
	MOVOU B1, B3
	// Now prepare powers of H and pre-computations for them
	// Seven iterations; dst moves down by two entries per iteration, so the
	// stores below land on entries 12/13, 10/11, ..., 0/1.
	MOVQ $7, AX

initLoop:
	MOVOU B2, T0
	MOVOU B2, T1
	MOVOU B3, T2
	PCLMULQDQ $0x00, B0, T0
	PCLMULQDQ $0x11, B0, T1
	PCLMULQDQ $0x00, B1, T2

	PXOR T0, T2
	PXOR T1, T2
	MOVOU T2, B4
	PSLLDQ $8, B4
	PSRLDQ $8, T2
	PXOR B4, T0
	PXOR T2, T1

	MOVOU POLY, B2
	PCLMULQDQ $0x01, T0, B2
	PSHUFD $78, T0, T0
	PXOR B2, T0
	MOVOU POLY, B2
	PCLMULQDQ $0x01, T0, B2
	PSHUFD $78, T0, T0
	PXOR T0, B2
	PXOR T1, B2

	MOVOU B2, (16*12)(dst)
	PSHUFD $78, B2, B3
	PXOR B2, B3
	MOVOU B3, (16*13)(dst)

	DECQ AX
	// LEAQ does not modify flags, so JNE tests the result of DECQ.
	LEAQ (-16*2)(dst), dst
	JNE initLoop

	RET

#undef NR
#undef KS
#undef dst

// func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
//
// Starts from a zero accumulator, hashes data in 128-byte groups, then in
// 16-byte blocks, then a final partial block (loaded byte by byte, so the
// unused high bytes are zero), and stores the accumulator to T. A length of
// exactly 13 takes a dedicated load path.
TEXT ·gcmAesData(SB),NOSPLIT,$0
#define pTbl DI
#define aut SI
#define tPtr CX
#define autLen DX

// reduceRound performs one reduction step on register a; clobbers T0.
// It is not #undef'd and is also used by gcmAesEnc and gcmAesDec below.
#define reduceRound(a) 	MOVOU POLY, T0;	PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a

// mulRoundAAD multiplies block register X by product-table pair i and
// accumulates into ACC0/ACC1/ACCM. Clobbers X, T1 and T2.
#define mulRoundAAD(X ,i) \
	MOVOU (16*(i*2))(pTbl), T1;\
	MOVOU T1, T2;\
	PCLMULQDQ $0x00, X, T1;\
	PXOR T1, ACC0;\
	PCLMULQDQ $0x11, X, T2;\
	PXOR T2, ACC1;\
	PSHUFD $78, X, T1;\
	PXOR T1, X;\
	MOVOU (16*(i*2+1))(pTbl), T1;\
	PCLMULQDQ $0x00, X, T1;\
	PXOR T1, ACCM

	MOVQ productTable+0(FP), pTbl
	MOVQ data_base+8(FP), aut
	MOVQ data_len+16(FP), autLen
	MOVQ T+32(FP), tPtr

	PXOR ACC0, ACC0
	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	TESTQ autLen, autLen
	JEQ dataBail

	CMPQ autLen, $13	// optimize the TLS case
	JE dataTLS
	CMPQ autLen, $128
	JB startSinglesLoop
	JMP dataOctaLoop

dataTLS:
	// Load exactly 13 bytes (8 + 4 + 1) into a zeroed B0, then treat it as
	// the final block by clearing autLen.
	MOVOU (16*14)(pTbl), T1
	MOVOU (16*15)(pTbl), T2
	PXOR B0, B0
	MOVQ (aut), B0
	PINSRD $2, 8(aut), B0
	PINSRB $12, 12(aut), B0
	XORQ autLen, autLen
	JMP dataMul

dataOctaLoop:
	CMPQ autLen, $128
	JB startSinglesLoop
	SUBQ $128, autLen

	MOVOU (16*0)(aut), X0
	MOVOU (16*1)(aut), X1
	MOVOU (16*2)(aut), X2
	MOVOU (16*3)(aut), X3
	MOVOU (16*4)(aut), X4
	MOVOU (16*5)(aut), X5
	MOVOU (16*6)(aut), X6
	MOVOU (16*7)(aut), X7
	LEAQ (16*8)(aut), aut
	PSHUFB BSWAP, X0
	PSHUFB BSWAP, X1
	PSHUFB BSWAP, X2
	PSHUFB BSWAP, X3
	PSHUFB BSWAP, X4
	PSHUFB BSWAP, X5
	PSHUFB BSWAP, X6
	PSHUFB BSWAP, X7
	// Only the first block of the group absorbs the running accumulator.
	PXOR ACC0, X0

	MOVOU (16*0)(pTbl), ACC0
	MOVOU (16*1)(pTbl), ACCM
	MOVOU ACC0, ACC1
	PSHUFD $78, X0, T1
	PXOR X0, T1
	PCLMULQDQ $0x00, X0, ACC0
	PCLMULQDQ $0x11, X0, ACC1
	PCLMULQDQ $0x00, T1, ACCM

	mulRoundAAD(X1, 1)
	mulRoundAAD(X2, 2)
	mulRoundAAD(X3, 3)
	mulRoundAAD(X4, 4)
	mulRoundAAD(X5, 5)
	mulRoundAAD(X6, 6)
	mulRoundAAD(X7, 7)

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0
	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0
	JMP dataOctaLoop

startSinglesLoop:
	// T1/T2 keep product-table entries 14/15 for the single-block path.
	MOVOU (16*14)(pTbl), T1
	MOVOU (16*15)(pTbl), T2

dataSinglesLoop:

	CMPQ autLen, $16
	JB dataEnd
	SUBQ $16, autLen

	MOVOU (aut), B0
dataMul:
	// Shared by the 16-byte loop, the 13-byte path and the partial-block
	// path; the latter two arrive with autLen == 0, so the loop exits via
	// dataEnd -> dataBail after this multiplication.
	PSHUFB BSWAP, B0
	PXOR ACC0, B0

	MOVOU T1, ACC0
	MOVOU T2, ACCM
	MOVOU T1, ACC1

	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PCLMULQDQ $0x00, T0, ACCM

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	MOVOU POLY, T0
	PCLMULQDQ $0x01, ACC0, T0
	PSHUFD $78, ACC0, ACC0
	PXOR T0, ACC0

	MOVOU POLY, T0
	PCLMULQDQ $0x01, ACC0, T0
	PSHUFD $78, ACC0, ACC0
	PXOR T0, ACC0
	PXOR ACC1, ACC0

	LEAQ 16(aut), aut

	JMP dataSinglesLoop

dataEnd:

	TESTQ autLen, autLen
	JEQ dataBail

	// Load the remaining 1..15 bytes back to front: aut starts at the last
	// byte, each iteration shifts B0 up one byte and inserts at byte 0.
	PXOR B0, B0
	LEAQ -1(aut)(autLen*1), aut

dataLoadLoop:

	PSLLDQ $1, B0
	PINSRB $0, (aut), B0

	LEAQ -1(aut), aut
	DECQ autLen
	JNE dataLoadLoop

	JMP dataMul

dataBail:
	MOVOU ACC0, (tPtr)
	RET

#undef pTbl
#undef aut
#undef tPtr
#undef autLen

// func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
//
// XORs src with the AES-encrypted counter stream, writes the result to dst
// and folds each output block into the hash state read from and written back
// to T.
//
// Stack frame (256 bytes):
//   (0*16 .. 7*16)(SP)          byte-reversed output blocks waiting to be hashed
//   (8*16 + 0*16 .. 7*16)(SP)   eight counter blocks, already XORed with
//                               round key 0
//
// The register aliases and helper macros defined here (except increment) are
// left defined and reused by gcmAesDec.
TEXT ·gcmAesEnc(SB),0,$256-96
#define pTbl DI
#define ctx DX
#define ctrPtr CX
#define ptx SI
#define ks AX
#define tPtr R8
#define ptxLen R9
#define aluCTR R10
#define aluTMP R11
#define aluK R12
#define NR R13

// increment(i): bump the 32-bit counter in aluCTR, XOR it with aluK (the
// byte-swapped last dword of round key 0), swap it back and store it as the
// last dword of counter slot i. Modifies flags.
#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + 8*16 + i*16)(SP)
// aesRnd(k): one AES round on B0..B7 using the key already in register k.
#define aesRnd(k) AESENC k, B0; AESENC k, B1; AESENC k, B2; AESENC k, B3; AESENC k, B4; AESENC k, B5; AESENC k, B6; AESENC k, B7
// aesRound(i): load round key i into T0 and apply one AES round to B0..B7.
#define aesRound(i) MOVOU (16*i)(ks), T0;AESENC T0, B0; AESENC T0, B1; AESENC T0, B2; AESENC T0, B3; AESENC T0, B4; AESENC T0, B5; AESENC T0, B6; AESENC T0, B7
// aesRndLast(k): final AES round on B0..B7 using the key in register k.
#define aesRndLast(k) AESENCLAST k, B0; AESENCLAST k, B1; AESENCLAST k, B2; AESENCLAST k, B3; AESENCLAST k, B4; AESENCLAST k, B5; AESENCLAST k, B6; AESENCLAST k, B7

// combinedRound(i): AES round i on B0..B7 interleaved with multiplying the
// saved block at (16*i)(SP) by product-table pair i. Clobbers T0, T1, T2.
#define combinedRound(i) \
	MOVOU (16*i)(ks), T0;\
	AESENC T0, B0;\
	AESENC T0, B1;\
	AESENC T0, B2;\
	AESENC T0, B3;\
	MOVOU (16*(i*2))(pTbl), T1;\
	MOVOU T1, T2;\
	AESENC T0, B4;\
	AESENC T0, B5;\
	AESENC T0, B6;\
	AESENC T0, B7;\
	MOVOU (16*i)(SP), T0;\
	PCLMULQDQ $0x00, T0, T1;\
	PXOR T1, ACC0;\
	PSHUFD $78, T0, T1;\
	PCLMULQDQ $0x11, T0, T2;\
	PXOR T1, T0;\
	PXOR T2, ACC1;\
	MOVOU (16*(i*2+1))(pTbl), T2;\
	PCLMULQDQ $0x00, T2, T0;\
	PXOR T0, ACCM

// mulRound(i): multiply the saved block at (16*i)(SP) by product-table pair i
// and accumulate into ACC0/ACC1/ACCM. Clobbers T0, T1, T2.
#define mulRound(i) \
	MOVOU (16*i)(SP), T0;\
	MOVOU (16*(i*2))(pTbl), T1;\
	MOVOU T1, T2;\
	PCLMULQDQ $0x00, T0, T1;\
	PXOR T1, ACC0;\
	PCLMULQDQ $0x11, T0, T2;\
	PXOR T2, ACC1;\
	PSHUFD $78, T0, T1;\
	PXOR T1, T0;\
	MOVOU (16*(i*2+1))(pTbl), T1;\
	PCLMULQDQ $0x00, T0, T1;\
	PXOR T1, ACCM

	MOVQ productTable+0(FP), pTbl
	MOVQ dst+8(FP), ctx
	MOVQ src_base+32(FP), ptx
	MOVQ src_len+40(FP), ptxLen
	MOVQ ctr+56(FP), ctrPtr
	MOVQ T+64(FP), tPtr
	MOVQ ks_base+72(FP), ks
	MOVQ ks_len+80(FP), NR

	// NR = len(ks)/4 - 1; selects the final round key (10, 12 or 14) below.
	SHRQ $2, NR
	DECQ NR

	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	MOVOU (tPtr), ACC0
	PXOR ACC1, ACC1
	PXOR ACCM, ACCM
	MOVOU (ctrPtr), B0
	MOVL (3*4)(ctrPtr), aluCTR
	MOVOU (ks), T0
	MOVL (3*4)(ks), aluK
	BSWAPL aluCTR
	BSWAPL aluK

	// T0 = counter block XOR round key 0. Slot 0 receives it, then
	// increment(0) replaces its last dword with the incremented counter.
	PXOR B0, T0
	MOVOU T0, (8*16 + 0*16)(SP)
	increment(0)

	CMPQ ptxLen, $128
	JB gcmAesEncSingles
	SUBQ $128, ptxLen

	// We have at least 8 blocks to encrypt, prepare the rest of the counters
	MOVOU T0, (8*16 + 1*16)(SP)
	increment(1)
	MOVOU T0, (8*16 + 2*16)(SP)
	increment(2)
	MOVOU T0, (8*16 + 3*16)(SP)
	increment(3)
	MOVOU T0, (8*16 + 4*16)(SP)
	increment(4)
	MOVOU T0, (8*16 + 5*16)(SP)
	increment(5)
	MOVOU T0, (8*16 + 6*16)(SP)
	increment(6)
	MOVOU T0, (8*16 + 7*16)(SP)
	increment(7)

	MOVOU (8*16 + 0*16)(SP), B0
	MOVOU (8*16 + 1*16)(SP), B1
	MOVOU (8*16 + 2*16)(SP), B2
	MOVOU (8*16 + 3*16)(SP), B3
	MOVOU (8*16 + 4*16)(SP), B4
	MOVOU (8*16 + 5*16)(SP), B5
	MOVOU (8*16 + 6*16)(SP), B6
	MOVOU (8*16 + 7*16)(SP), B7

	// First group of eight: AES only (there is nothing to hash yet). The
	// interleaved increments prepare the counter slots for the next group.
	aesRound(1)
	increment(0)
	aesRound(2)
	increment(1)
	aesRound(3)
	increment(2)
	aesRound(4)
	increment(3)
	aesRound(5)
	increment(4)
	aesRound(6)
	increment(5)
	aesRound(7)
	increment(6)
	aesRound(8)
	increment(7)
	aesRound(9)
	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB encLast1
	aesRnd(T0)
	aesRound(11)
	MOVOU (16*12)(ks), T0
	// Flags from CMPQ NR, $12 are still live here.
	JE encLast1
	aesRnd(T0)
	aesRound(13)
	MOVOU (16*14)(ks), T0
encLast1:
	aesRndLast(T0)

	MOVOU (16*0)(ptx), T0
	PXOR T0, B0
	MOVOU (16*1)(ptx), T0
	PXOR T0, B1
	MOVOU (16*2)(ptx), T0
	PXOR T0, B2
	MOVOU (16*3)(ptx), T0
	PXOR T0, B3
	MOVOU (16*4)(ptx), T0
	PXOR T0, B4
	MOVOU (16*5)(ptx), T0
	PXOR T0, B5
	MOVOU (16*6)(ptx), T0
	PXOR T0, B6
	MOVOU (16*7)(ptx), T0
	PXOR T0, B7

	// Store the output, then keep a byte-reversed copy on the stack for
	// hashing; the running accumulator is folded into the first block.
	MOVOU B0, (16*0)(ctx)
	PSHUFB BSWAP, B0
	PXOR ACC0, B0
	MOVOU B1, (16*1)(ctx)
	PSHUFB BSWAP, B1
	MOVOU B2, (16*2)(ctx)
	PSHUFB BSWAP, B2
	MOVOU B3, (16*3)(ctx)
	PSHUFB BSWAP, B3
	MOVOU B4, (16*4)(ctx)
	PSHUFB BSWAP, B4
	MOVOU B5, (16*5)(ctx)
	PSHUFB BSWAP, B5
	MOVOU B6, (16*6)(ctx)
	PSHUFB BSWAP, B6
	MOVOU B7, (16*7)(ctx)
	PSHUFB BSWAP, B7

	MOVOU B0, (16*0)(SP)
	MOVOU B1, (16*1)(SP)
	MOVOU B2, (16*2)(SP)
	MOVOU B3, (16*3)(SP)
	MOVOU B4, (16*4)(SP)
	MOVOU B5, (16*5)(SP)
	MOVOU B6, (16*6)(SP)
	MOVOU B7, (16*7)(SP)

	LEAQ 128(ptx), ptx
	LEAQ 128(ctx), ctx

gcmAesEncOctetsLoop:
	// Each iteration encrypts the next eight blocks while hashing the eight
	// output blocks saved on the stack by the previous group.

	CMPQ ptxLen, $128
	JB gcmAesEncOctetsEnd
	SUBQ $128, ptxLen

	MOVOU (8*16 + 0*16)(SP), B0
	MOVOU (8*16 + 1*16)(SP), B1
	MOVOU (8*16 + 2*16)(SP), B2
	MOVOU (8*16 + 3*16)(SP), B3
	MOVOU (8*16 + 4*16)(SP), B4
	MOVOU (8*16 + 5*16)(SP), B5
	MOVOU (8*16 + 6*16)(SP), B6
	MOVOU (8*16 + 7*16)(SP), B7

	MOVOU (16*0)(SP), T0
	PSHUFD $78, T0, T1
	PXOR T0, T1

	MOVOU (16*0)(pTbl), ACC0
	MOVOU (16*1)(pTbl), ACCM
	MOVOU ACC0, ACC1

	PCLMULQDQ $0x00, T1, ACCM
	PCLMULQDQ $0x00, T0, ACC0
	PCLMULQDQ $0x11, T0, ACC1

	combinedRound(1)
	increment(0)
	combinedRound(2)
	increment(1)
	combinedRound(3)
	increment(2)
	combinedRound(4)
	increment(3)
	combinedRound(5)
	increment(4)
	combinedRound(6)
	increment(5)
	combinedRound(7)
	increment(6)

	aesRound(8)
	increment(7)

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	aesRound(9)

	reduceRound(ACC0)
	PXOR ACC1, ACC0

	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB encLast2
	aesRnd(T0)
	aesRound(11)
	MOVOU (16*12)(ks), T0
	JE encLast2
	aesRnd(T0)
	aesRound(13)
	MOVOU (16*14)(ks), T0
encLast2:
	aesRndLast(T0)

	MOVOU (16*0)(ptx), T0
	PXOR T0, B0
	MOVOU (16*1)(ptx), T0
	PXOR T0, B1
	MOVOU (16*2)(ptx), T0
	PXOR T0, B2
	MOVOU (16*3)(ptx), T0
	PXOR T0, B3
	MOVOU (16*4)(ptx), T0
	PXOR T0, B4
	MOVOU (16*5)(ptx), T0
	PXOR T0, B5
	MOVOU (16*6)(ptx), T0
	PXOR T0, B6
	MOVOU (16*7)(ptx), T0
	PXOR T0, B7

	MOVOU B0, (16*0)(ctx)
	PSHUFB BSWAP, B0
	PXOR ACC0, B0
	MOVOU B1, (16*1)(ctx)
	PSHUFB BSWAP, B1
	MOVOU B2, (16*2)(ctx)
	PSHUFB BSWAP, B2
	MOVOU B3, (16*3)(ctx)
	PSHUFB BSWAP, B3
	MOVOU B4, (16*4)(ctx)
	PSHUFB BSWAP, B4
	MOVOU B5, (16*5)(ctx)
	PSHUFB BSWAP, B5
	MOVOU B6, (16*6)(ctx)
	PSHUFB BSWAP, B6
	MOVOU B7, (16*7)(ctx)
	PSHUFB BSWAP, B7

	MOVOU B0, (16*0)(SP)
	MOVOU B1, (16*1)(SP)
	MOVOU B2, (16*2)(SP)
	MOVOU B3, (16*3)(SP)
	MOVOU B4, (16*4)(SP)
	MOVOU B5, (16*5)(SP)
	MOVOU B6, (16*6)(SP)
	MOVOU B7, (16*7)(SP)

	LEAQ 128(ptx), ptx
	LEAQ 128(ctx), ctx

	JMP gcmAesEncOctetsLoop

gcmAesEncOctetsEnd:
	// Hash the last group of eight output blocks still held on the stack.

	MOVOU (16*0)(SP), T0
	MOVOU (16*0)(pTbl), ACC0
	MOVOU (16*1)(pTbl), ACCM
	MOVOU ACC0, ACC1
	PSHUFD $78, T0, T1
	PXOR T0, T1
	PCLMULQDQ $0x00, T0, ACC0
	PCLMULQDQ $0x11, T0, ACC1
	PCLMULQDQ $0x00, T1, ACCM

	mulRound(1)
	mulRound(2)
	mulRound(3)
	mulRound(4)
	mulRound(5)
	mulRound(6)
	mulRound(7)

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

	TESTQ ptxLen, ptxLen
	JE gcmAesEncDone

	// Slots 0..7 were pre-filled for another group of eight, so aluCTR is
	// seven ahead of the counter stored in slot 0. The single-block code
	// below only uses slot 0 and expects aluCTR to match it.
	SUBQ $7, aluCTR

gcmAesEncSingles:

	// Keep round keys 1..7 in B1..B7 and product-table entry 14 in T2 for
	// the one-block-at-a-time path.
	MOVOU (16*1)(ks), B1
	MOVOU (16*2)(ks), B2
	MOVOU (16*3)(ks), B3
	MOVOU (16*4)(ks), B4
	MOVOU (16*5)(ks), B5
	MOVOU (16*6)(ks), B6
	MOVOU (16*7)(ks), B7

	MOVOU (16*14)(pTbl), T2

gcmAesEncSinglesLoop:

	CMPQ ptxLen, $16
	JB gcmAesEncTail
	SUBQ $16, ptxLen

	MOVOU (8*16 + 0*16)(SP), B0
	increment(0)

	AESENC B1, B0
	AESENC B2, B0
	AESENC B3, B0
	AESENC B4, B0
	AESENC B5, B0
	AESENC B6, B0
	AESENC B7, B0
	MOVOU (16*8)(ks), T0
	AESENC T0, B0
	MOVOU (16*9)(ks), T0
	AESENC T0, B0
	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB encLast3
	AESENC T0, B0
	MOVOU (16*11)(ks), T0
	AESENC T0, B0
	MOVOU (16*12)(ks), T0
	JE encLast3
	AESENC T0, B0
	MOVOU (16*13)(ks), T0
	AESENC T0, B0
	MOVOU (16*14)(ks), T0
encLast3:
	AESENCLAST T0, B0

	MOVOU (ptx), T0
	PXOR T0, B0
	MOVOU B0, (ctx)

	PSHUFB BSWAP, B0
	PXOR ACC0, B0

	MOVOU T2, ACC0
	MOVOU T2, ACC1
	MOVOU (16*15)(pTbl), ACCM

	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PCLMULQDQ $0x00, T0, ACCM

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

	LEAQ (16*1)(ptx), ptx
	LEAQ (16*1)(ctx), ctx

	JMP gcmAesEncSinglesLoop

gcmAesEncTail:
	// 1..15 bytes remain (0 exits immediately).
	TESTQ ptxLen, ptxLen
	JE gcmAesEncDone

	MOVOU (8*16 + 0*16)(SP), B0
	AESENC B1, B0
	AESENC B2, B0
	AESENC B3, B0
	AESENC B4, B0
	AESENC B5, B0
	AESENC B6, B0
	AESENC B7, B0
	MOVOU (16*8)(ks), T0
	AESENC T0, B0
	MOVOU (16*9)(ks), T0
	AESENC T0, B0
	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB encLast4
	AESENC T0, B0
	MOVOU (16*11)(ks), T0
	AESENC T0, B0
	MOVOU (16*12)(ks), T0
	JE encLast4
	AESENC T0, B0
	MOVOU (16*13)(ks), T0
	AESENC T0, B0
	MOVOU (16*14)(ks), T0
encLast4:
	AESENCLAST T0, B0
	// T0 = key stream block.
	MOVOU B0, T0

	// Point ptx at the last input byte for the back-to-front load below.
	LEAQ -1(ptx)(ptxLen*1), ptx

	// T1 = andMask entry that keeps the low ptxLen bytes. aluCTR is reused
	// as the table base here; the counter is no longer needed on this path.
	MOVQ ptxLen, aluTMP
	SHLQ $4, aluTMP

	LEAQ andMask<>(SB), aluCTR
	MOVOU -16(aluCTR)(aluTMP*1), T1

	PXOR B0, B0
ptxLoadLoop:
	PSLLDQ $1, B0
	PINSRB $0, (ptx), B0
	LEAQ -1(ptx), ptx
	DECQ ptxLen
	JNE ptxLoadLoop

	PXOR T0, B0
	PAND T1, B0
	// Note: this writes a full 16 bytes to ctx even though fewer than 16
	// bytes of input remained.
	MOVOU B0, (ctx)	// I assume there is always space, due to TAG in the end of the CT

	PSHUFB BSWAP, B0
	PXOR ACC0, B0

	MOVOU T2, ACC0
	MOVOU T2, ACC1
	MOVOU (16*15)(pTbl), ACCM

	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PCLMULQDQ $0x00, T0, ACCM

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

gcmAesEncDone:
	MOVOU ACC0, (tPtr)
	RET

#undef increment

// func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
//
// Folds each input block from src into the hash state read from and written
// back to T, and XORs src with the AES-encrypted counter stream into dst.
// Reuses the register aliases and helper macros defined in gcmAesEnc; note
// that here ptx is bound to dst and ctx to src.
//
// Stack frame (128 bytes): (0*16 .. 7*16)(SP) hold the eight counter blocks,
// already XORed with round key 0.
TEXT ·gcmAesDec(SB),0,$128-96
// Same as the gcmAesEnc increment, but the counter slots start at 0(SP).
#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + i*16)(SP)

// combinedDecRound(i): AES round i on B0..B7 interleaved with hashing input
// block i read directly from ctx. Clobbers T0, T1, T2.
#define combinedDecRound(i) \
	MOVOU (16*i)(ks), T0;\
	AESENC T0, B0;\
	AESENC T0, B1;\
	AESENC T0, B2;\
	AESENC T0, B3;\
	MOVOU (16*(i*2))(pTbl), T1;\
	MOVOU T1, T2;\
	AESENC T0, B4;\
	AESENC T0, B5;\
	AESENC T0, B6;\
	AESENC T0, B7;\
	MOVOU (16*i)(ctx), T0;\
	PSHUFB BSWAP, T0;\
	PCLMULQDQ $0x00, T0, T1;\
	PXOR T1, ACC0;\
	PSHUFD $78, T0, T1;\
	PCLMULQDQ $0x11, T0, T2;\
	PXOR T1, T0;\
	PXOR T2, ACC1;\
	MOVOU (16*(i*2+1))(pTbl), T2;\
	PCLMULQDQ $0x00, T2, T0;\
	PXOR T0, ACCM

	MOVQ productTable+0(FP), pTbl
	MOVQ dst+8(FP), ptx
	MOVQ src_base+32(FP), ctx
	MOVQ src_len+40(FP), ptxLen
	MOVQ ctr+56(FP), ctrPtr
	MOVQ T+64(FP), tPtr
	MOVQ ks_base+72(FP), ks
	MOVQ ks_len+80(FP), NR

	// NR = len(ks)/4 - 1; selects the final round key (10, 12 or 14) below.
	SHRQ $2, NR
	DECQ NR

	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	MOVOU (tPtr), ACC0
	PXOR ACC1, ACC1
	PXOR ACCM, ACCM
	MOVOU (ctrPtr), B0
	MOVL (3*4)(ctrPtr), aluCTR
	MOVOU (ks), T0
	MOVL (3*4)(ks), aluK
	BSWAPL aluCTR
	BSWAPL aluK

	// T0 = counter block XOR round key 0; see the increment macro.
	PXOR B0, T0
	MOVOU T0, (0*16)(SP)
	increment(0)

	CMPQ ptxLen, $128
	JB gcmAesDecSingles

	MOVOU T0, (1*16)(SP)
	increment(1)
	MOVOU T0, (2*16)(SP)
	increment(2)
	MOVOU T0, (3*16)(SP)
	increment(3)
	MOVOU T0, (4*16)(SP)
	increment(4)
	MOVOU T0, (5*16)(SP)
	increment(5)
	MOVOU T0, (6*16)(SP)
	increment(6)
	MOVOU T0, (7*16)(SP)
	increment(7)

gcmAesDecOctetsLoop:
	// Each iteration hashes eight input blocks and decrypts the same eight.

	CMPQ ptxLen, $128
	JB gcmAesDecEndOctets
	SUBQ $128, ptxLen

	MOVOU (0*16)(SP), B0
	MOVOU (1*16)(SP), B1
	MOVOU (2*16)(SP), B2
	MOVOU (3*16)(SP), B3
	MOVOU (4*16)(SP), B4
	MOVOU (5*16)(SP), B5
	MOVOU (6*16)(SP), B6
	MOVOU (7*16)(SP), B7

	// The first input block of the group absorbs the running accumulator.
	MOVOU (16*0)(ctx), T0
	PSHUFB BSWAP, T0
	PXOR ACC0, T0
	PSHUFD $78, T0, T1
	PXOR T0, T1

	MOVOU (16*0)(pTbl), ACC0
	MOVOU (16*1)(pTbl), ACCM
	MOVOU ACC0, ACC1

	PCLMULQDQ $0x00, T1, ACCM
	PCLMULQDQ $0x00, T0, ACC0
	PCLMULQDQ $0x11, T0, ACC1

	combinedDecRound(1)
	increment(0)
	combinedDecRound(2)
	increment(1)
	combinedDecRound(3)
	increment(2)
	combinedDecRound(4)
	increment(3)
	combinedDecRound(5)
	increment(4)
	combinedDecRound(6)
	increment(5)
	combinedDecRound(7)
	increment(6)

	aesRound(8)
	increment(7)

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	aesRound(9)

	reduceRound(ACC0)
	PXOR ACC1, ACC0

	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB decLast1
	aesRnd(T0)
	aesRound(11)
	MOVOU (16*12)(ks), T0
	// Flags from CMPQ NR, $12 are still live here.
	JE decLast1
	aesRnd(T0)
	aesRound(13)
	MOVOU (16*14)(ks), T0
decLast1:
	aesRndLast(T0)

	MOVOU (16*0)(ctx), T0
	PXOR T0, B0
	MOVOU (16*1)(ctx), T0
	PXOR T0, B1
	MOVOU (16*2)(ctx), T0
	PXOR T0, B2
	MOVOU (16*3)(ctx), T0
	PXOR T0, B3
	MOVOU (16*4)(ctx), T0
	PXOR T0, B4
	MOVOU (16*5)(ctx), T0
	PXOR T0, B5
	MOVOU (16*6)(ctx), T0
	PXOR T0, B6
	MOVOU (16*7)(ctx), T0
	PXOR T0, B7

	MOVOU B0, (16*0)(ptx)
	MOVOU B1, (16*1)(ptx)
	MOVOU B2, (16*2)(ptx)
	MOVOU B3, (16*3)(ptx)
	MOVOU B4, (16*4)(ptx)
	MOVOU B5, (16*5)(ptx)
	MOVOU B6, (16*6)(ptx)
	MOVOU B7, (16*7)(ptx)

	LEAQ 128(ptx), ptx
	LEAQ 128(ctx), ctx

	JMP gcmAesDecOctetsLoop

gcmAesDecEndOctets:

	// Slots 0..7 were pre-filled for another group of eight, so aluCTR is
	// seven ahead of the counter stored in slot 0; re-sync it for the
	// single-block code, which only uses slot 0.
	SUBQ $7, aluCTR

gcmAesDecSingles:

	// Keep round keys 1..7 in B1..B7 and product-table entry 14 in T2.
	MOVOU (16*1)(ks), B1
	MOVOU (16*2)(ks), B2
	MOVOU (16*3)(ks), B3
	MOVOU (16*4)(ks), B4
	MOVOU (16*5)(ks), B5
	MOVOU (16*6)(ks), B6
	MOVOU (16*7)(ks), B7

	MOVOU (16*14)(pTbl), T2

gcmAesDecSinglesLoop:

	CMPQ ptxLen, $16
	JB gcmAesDecTail
	SUBQ $16, ptxLen

	// T1 keeps the raw input block for the final XOR; B0 is hashed first.
	MOVOU (ctx), B0
	MOVOU B0, T1
	PSHUFB BSWAP, B0
	PXOR ACC0, B0

	MOVOU T2, ACC0
	MOVOU T2, ACC1
	MOVOU (16*15)(pTbl), ACCM

	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, T0, ACCM

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

	MOVOU (0*16)(SP), B0
	increment(0)
	AESENC B1, B0
	AESENC B2, B0
	AESENC B3, B0
	AESENC B4, B0
	AESENC B5, B0
	AESENC B6, B0
	AESENC B7, B0
	MOVOU (16*8)(ks), T0
	AESENC T0, B0
	MOVOU (16*9)(ks), T0
	AESENC T0, B0
	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB decLast2
	AESENC T0, B0
	MOVOU (16*11)(ks), T0
	AESENC T0, B0
	MOVOU (16*12)(ks), T0
	JE decLast2
	AESENC T0, B0
	MOVOU (16*13)(ks), T0
	AESENC T0, B0
	MOVOU (16*14)(ks), T0
decLast2:
	AESENCLAST T0, B0

	PXOR T1, B0
	MOVOU B0, (ptx)

	LEAQ (16*1)(ptx), ptx
	LEAQ (16*1)(ctx), ctx

	JMP gcmAesDecSinglesLoop

gcmAesDecTail:
	// 1..15 bytes remain (0 exits immediately).

	TESTQ ptxLen, ptxLen
	JE gcmAesDecDone

	// T1 = andMask entry that keeps the low ptxLen bytes. aluCTR is reused
	// as the table base from here on.
	MOVQ ptxLen, aluTMP
	SHLQ $4, aluTMP
	LEAQ andMask<>(SB), aluCTR
	MOVOU -16(aluCTR)(aluTMP*1), T1

	// Note: this reads a full 16 bytes from ctx even though fewer than 16
	// bytes of input remain.
	MOVOU (ctx), B0	// I assume there is TAG attached to the ctx, and there is no read overflow
	PAND T1, B0

	MOVOU B0, T1
	PSHUFB BSWAP, B0
	PXOR ACC0, B0

	MOVOU (16*14)(pTbl), ACC0
	MOVOU (16*15)(pTbl), ACCM
	MOVOU ACC0, ACC1

	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, T0, ACCM

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

	MOVOU (0*16)(SP), B0
	// NOTE(review): aluCTR holds the andMask address at this point, so the
	// value this increment stores into slot 0 is not a counter. B0 was
	// loaded before the store and slot 0 is not read again in this function.
	increment(0)
	AESENC B1, B0
	AESENC B2, B0
	AESENC B3, B0
	AESENC B4, B0
	AESENC B5, B0
	AESENC B6, B0
	AESENC B7, B0
	MOVOU (16*8)(ks), T0
	AESENC T0, B0
	MOVOU (16*9)(ks), T0
	AESENC T0, B0
	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB decLast3
	AESENC T0, B0
	MOVOU (16*11)(ks), T0
	AESENC T0, B0
	MOVOU (16*12)(ks), T0
	JE decLast3
	AESENC T0, B0
	MOVOU (16*13)(ks), T0
	AESENC T0, B0
	MOVOU (16*14)(ks), T0
decLast3:
	AESENCLAST T0, B0

	PXOR T1, B0

	// Write exactly ptxLen bytes, lowest byte first.
ptxStoreLoop:
	PEXTRB $0, B0, (ptx)
	PSRLDQ $1, B0
	LEAQ 1(ptx), ptx
	DECQ ptxLen

	JNE ptxStoreLoop

gcmAesDecDone:

	MOVOU ACC0, (tPtr)
	RET