// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ppc64 || ppc64le

#include "go_asm.h"
#include "textflag.h"

// IndexByte entry point for a byte slice.
// Arguments are taken from registers, so the symbol must be assembled with
// the <ABIInternal> selector; without it the routine would be treated as
// ABI0 and the registers read below would not hold the arguments.
// In:  R3 = byte array pointer, R4 = length, R6 = byte to find.
// Out: R3 = index of the first match, or -1 (produced by indexbytebody).
TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
	// R3 = byte array pointer
	// R4 = length
	MOVD	R6, R5		// R5 = byte (the incoming R5 value is not used)
	BR	indexbytebody<>(SB)

// IndexByteString entry point for a string.
// The registers already match what indexbytebody expects, so this is a
// plain tail branch.
// In:  R3 = string pointer, R4 = length, R5 = byte to find.
// Out: R3 = index of the first match, or -1 (produced by indexbytebody).
TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
	// R3 = string
	// R4 = length
	// R5 = byte
	BR	indexbytebody<>(SB)

// Bit-selection constant for VBPERMQ. It is only needed when the ISA 3.0
// (power9) count-zero-LSB-bytes instructions cannot be used.
#ifndef GOPPC64_power9
#ifdef GOARCH_ppc64le
DATA indexbytevbperm<>+0(SB)/8, $0x3830282018100800
DATA indexbytevbperm<>+8(SB)/8, $0x7870686058504840
#else
DATA indexbytevbperm<>+0(SB)/8, $0x0008101820283038
DATA indexbytevbperm<>+8(SB)/8, $0x4048505860687078
#endif
GLOBL indexbytevbperm<>+0(SB), RODATA, $16
#endif

// Some operations are endian specific, choose the correct opcode based on GOARCH.
// Note, _VCZBEBB is only available on power9 and newer.
#ifdef GOARCH_ppc64le
#define _LDBEX	MOVDBR
#define _LWBEX	MOVWBR
#define _LHBEX	MOVHBR
#define _VCZBEBB VCTZLSBB
#else
#define _LDBEX	MOVD
#define _LWBEX	MOVW
#define _LHBEX	MOVH
#define _VCZBEBB VCLZLSBB
#endif

// Shared search body.
// R3 = addr of string
// R4 = len of string
// R5 = byte to find
// On exit:
//   R3 = return value (index of first match, or -1 if there is none)
//
// Registers written here: R3-R6, R8-R12, R16 (pre-power9 only), R31,
// V0 (pre-power9 only), V1, V2, V6, CR0, CR1, CR6.
//
// Strategy by length:
//   >= 64   : 64 bytes per iteration, then the tail (via cmp32 or the
//             cmp64_tail_* labels)
//   32 - 63 : cmp32
//   16 - 31 : cmp16
//   0 - 15  : cmp8 / cmp4 / cmp2 / cmp1
// The vector paths finish with an overlapping 16 byte load that ends at the
// last byte of the input instead of reading past it.
TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
	CMPU	R4,$32		// CR0 = len cmp 32; consumed by BLT below and again in cmp32.

#ifndef GOPPC64_power9
	// Load VBPERMQ constant to reduce compare into an ordered bit mask.
	MOVD	$indexbytevbperm<>+00(SB),R16
	LXVD2X	(R16),V0	// Set up swap string
#endif

	MTVRD	R5,V1
	VSPLTB	$7,V1,V1	// Replicate byte across V1

	BLT	cmp16		// Jump to the small string case if it's <32 bytes.

	CMP	R4,$64,CR1
	MOVD	$16,R11
	MOVD	R3,R8
	BLT	CR1,cmp32	// Special case for length 32 - 63
	MOVD	$32,R12
	MOVD	$48,R6

	RLDICR	$0,R4,$63-6,R9	// R9 = len &^ 63
	ADD	R3,R9,R9	// R9 = &s[len &^ 63]
	ANDCC	$63,R4		// (len &= 63) cmp 0.

	PCALIGN	$16
loop64:
	LXVD2X	(R0)(R8),V2	// Scan 64 bytes at a time, starting at &s[0]
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0	// Match found at R8, jump out

	LXVD2X	(R11)(R8),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat1	// Match found at R8+16 bytes, jump out

	LXVD2X	(R12)(R8),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat2	// Match found at R8+32 bytes, jump out

	LXVD2X	(R6)(R8),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat3	// Match found at R8+48 bytes, jump out

	ADD	$64,R8
	CMPU	R8,R9,CR1
	BNE	CR1,loop64	// R8 != &s[len &^ 63]?

	PCALIGN	$32
	BEQ	notfound	// Is tail length 0? CR0 is set before entering loop64.

	CMP	R4,$32		// Tail length >= 32, use cmp32 path.
	CMP	R4,$16,CR1
	BGE	cmp32

	ADD	R8,R4,R9
	ADD	$-16,R9		// R9 = address of the last 16 bytes of the input
	BLE	CR1,cmp64_tail_gt0

cmp64_tail_gt16:	// Tail length 17 - 31
	LXVD2X	(R0)(R8),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0

cmp64_tail_gt0:	// Tail length 1 - 16 (also the final check for 17 - 31)
	MOVD	R9,R8		// foundat0 computes the index from R8.
	LXVD2X	(R0)(R9),V2	// Overlapping load of the last 16 bytes.
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0

	BR	notfound

cmp32:	// Length 32 - 63 (R8 = start of the region, R4 = its length)

	// Bytes 0 - 15
	LXVD2X	(R0)(R8),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0

	// Bytes 16 - 31
	LXVD2X	(R8)(R11),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat1	// Match found at R8+16 bytes, jump out

	BEQ	notfound	// Is length == 32? (CR0 holds the compare against 32 on entry to cmp32)
	CMP	R4,$48

	ADD	R4,R8,R9	// Compute &s[len(s)-16]
	ADD	$32,R8,R8
	ADD	$-16,R9,R9
	ISEL	CR0GT,R8,R9,R8	// R8 = len(s) <= 48 ? R9 : R8

	// Bytes 32 - 47, or the last 16 bytes when length <= 48
	LXVD2X	(R0)(R8),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0	// Match found at R8 (already advanced), jump out

	BLE	notfound	// Length <= 48: the load above covered the end.

	// Last 16 bytes (covers bytes 48 - 62)
	MOVD	R9,R8		// R9 holds the final check.
	LXVD2X	(R0)(R9),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0	// Match found at R8 (== R9), jump out

	BR	notfound

// If ISA 3.0 instructions are unavailable, we need to account for the extra 16 added by CNTLZW.
#ifndef GOPPC64_power9
#define ADJUST_FOR_CNTLZW -16
#else
#define ADJUST_FOR_CNTLZW 0
#endif

// Now, find the index of the 16B vector the match was discovered in. If CNTLZW is used
// to determine the offset into the 16B vector, it will overcount by 16. Account for it here.
foundat3:
	SUB	R3,R8,R3	// R3 = R8 - R3, the offset of the current chunk
	ADD	$48+ADJUST_FOR_CNTLZW,R3
	BR	vfound
foundat2:
	SUB	R3,R8,R3
	ADD	$32+ADJUST_FOR_CNTLZW,R3
	BR	vfound
foundat1:
	SUB	R3,R8,R3
	ADD	$16+ADJUST_FOR_CNTLZW,R3
	BR	vfound
foundat0:
	SUB	R3,R8,R3
	ADD	$0+ADJUST_FOR_CNTLZW,R3
vfound:
	// Map equal values into a 16 bit value with earlier matches setting higher bits.
#ifndef GOPPC64_power9
	VBPERMQ	V6,V0,V6
	MFVRD	V6,R4
	CNTLZW	R4,R4
#else
#ifdef GOARCH_ppc64le
	// Put the value back into LE ordering by swapping doublewords.
	XXPERMDI	V6,V6,$2,V6
#endif
	_VCZBEBB	V6,R4
#endif
	ADD	R3,R4,R3	// index = vector offset + position inside the vector
	RET

cmp16:	// Length 16 - 31 (lengths below 16 branch on to cmp8)
	CMPU	R4,$16
	ADD	R4,R3,R9	// R9 = &s[len(s)]; the short paths below rely on it too.
	BLT	cmp8

	ADD	$-16,R9,R9	// &s[len(s)-16]

	// Bytes 0 - 15
	LXVD2X	(R0)(R3),V2
	VCMPEQUBCC	V2,V1,V6
	MOVD	R3,R8
	BNE	CR6,foundat0	// Match found at R8, jump out
	BEQ	notfound	// Length == 16: nothing left to check.

	// Last 16 bytes (covers bytes 16 - 30)
	MOVD	R9,R8		// R9 holds the final check.
	LXVD2X	(R0)(R9),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0	// Match found at R8 (== R9), jump out

	BR	notfound


cmp8:	// Length 0 - 15
#ifdef GOPPC64_power10
	// Load all the bytes into a single VSR in BE order.
	SLD	$56,R4,R5
	LXVLL	R3,R5,V2
	// Compare and count the number which don't match.
	VCMPEQUB	V2,V1,V6
	VCLZLSBB	V6,R3
	// If count is the number of bytes, or more. No matches are found.
	CMPU	R3,R4
	MOVD	$-1,R5
	// Otherwise, the count is the index of the first match.
	ISEL	CR0LT,R3,R5,R3
	RET
#else
	RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register.
	RLDIMI	$16,R5,$32,R5
	RLDIMI	$32,R5,$0,R5
	CMPU	R4,$8
	BLT	cmp4
	// Length 8 - 15: check the first and the last 8 bytes (they may overlap).
	MOVD	$-8,R11
	ADD	$-8,R4,R4	// R4 = offset of the last doubleword

	_LDBEX	(R0)(R3),R10
	_LDBEX	(R11)(R9),R11
	CMPB	R10,R5,R10
	CMPB	R11,R5,R11
	CMPU	R10,$0		// CR0: any match in the first doubleword?
	CMPU	R11,$0,CR1	// CR1: any match in the last doubleword?
	CNTLZD	R10,R10
	CNTLZD	R11,R11
	SRD	$3,R10,R3	// Leading zero bits / 8 = byte index
	SRD	$3,R11,R11
	BNE	found

	ADD	R4,R11,R4
	MOVD	$-1,R3
	ISEL	CR1EQ,R3,R4,R3	// No match in the last doubleword ? -1 : index
	RET

cmp4:	// Length 4 - 7 (shorter lengths branch on to cmp2)
	CMPU	R4,$4
	BLT	cmp2
	MOVD	$-4,R11
	ADD	$-4,R4,R4	// R4 = offset of the last word

	_LWBEX	(R0)(R3),R10
	_LWBEX	(R11)(R9),R11
	CMPB	R10,R5,R10
	CMPB	R11,R5,R11
	CNTLZW	R10,R10
	CNTLZW	R11,R11
	CMPU	R10,$32		// A count of 32 means no byte matched.
	CMPU	R11,$32,CR1
	SRD	$3,R10,R3
	SRD	$3,R11,R11
	BNE	found

	ADD	R4,R11,R4
	MOVD	$-1,R3
	ISEL	CR1EQ,R3,R4,R3	// No match in the last word ? -1 : index
	RET

cmp2:	// Length 2 - 3 (shorter lengths branch on to cmp1)
	CMPU	R4,$2
	BLT	cmp1

	_LHBEX	(R0)(R3),R10
	CMPB	R10,R5,R10
	SLDCC	$48,R10,R10	// Keep only the two loaded bytes; CR0 != 0 if either matched.
	CNTLZD	R10,R10
	SRD	$3,R10,R3
	BNE	found

cmp1:	// Length 0 - 1, or the final byte of a length 3 input
	MOVD	$-1,R3
	ANDCC	$1,R4,R31	// Even length: no byte left to check.
	BEQ	found

	MOVBZ	-1(R9),R10	// Last byte; R9 = &s[len(s)]
	CMPB	R10,R5,R10
	ANDCC	$1,R10
	ADD	$-1,R4
	ISEL	CR0EQ,R3,R4,R3	// No match ? -1 : len(s)-1

found:
	RET
#endif

notfound:
	MOVD	$-1,R3
	RET