1 // Copyright 2018 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "go_asm.h"
6 #include "asm_amd64.h"
7 #include "textflag.h"
8
// Count is the entry point for counting occurrences of the byte c in the
// byte slice b. It only marshals arguments; the counting is done by
// countbody, which stores the result through the pointer passed in R8.
//
// Argument frame (40 bytes): b_base+0, b_len+8, c+24, ret+32.
TEXT ·Count(SB),NOSPLIT,$0-40
#ifndef hasPOPCNT
	// countbody requires POPCNT. When the CPU feature flag is not set,
	// hand the call over to ·countGeneric instead.
	CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
	JEQ	havepopcnt
	JMP	·countGeneric(SB)
havepopcnt:
#endif
	// Load countbody's register arguments, then tail-jump to it.
	LEAQ	ret+32(FP), R8		// R8 = address of the result slot
	MOVB	c+24(FP), AL		// AL = byte sought
	MOVQ	b_len+8(FP), BX		// BX = number of bytes to scan
	MOVQ	b_base+0(FP), SI	// SI = start of the data
	JMP	countbody<>(SB)
20
// CountString is the entry point for counting occurrences of the byte c in
// the string s. It only marshals arguments; the counting is done by
// countbody, which stores the result through the pointer passed in R8.
//
// Argument frame (32 bytes): s_base+0, s_len+8, c+16, ret+24.
TEXT ·CountString(SB),NOSPLIT,$0-32
#ifndef hasPOPCNT
	// countbody requires POPCNT. When the CPU feature flag is not set,
	// hand the call over to ·countGenericString instead.
	CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
	JEQ	havepopcnt
	JMP	·countGenericString(SB)
havepopcnt:
#endif
	// Load countbody's register arguments, then tail-jump to it.
	LEAQ	ret+24(FP), R8		// R8 = address of the result slot
	MOVB	c+16(FP), AL		// AL = byte sought
	MOVQ	s_len+8(FP), BX		// BX = number of bytes to scan
	MOVQ	s_base+0(FP), SI	// SI = start of the data
	JMP	countbody<>(SB)
32
// countbody counts the bytes equal to AL in the BX bytes starting at SI and
// stores the count as a 64-bit value at the address held in R8.
//
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
//
// Registers written: AX, BX, CX, DX, DI, R10, R11, R12, X0, X1 and, on the
// AVX2 path, Y1-Y3. Flags are clobbered.
//
// Strategy by length:
//   len == 0       store 0.
//   len  < 16      one masked 16-byte load, positioned so that it does not
//                  run past the 4096-byte page containing the data.
//   len >= 16      16-byte SSE blocks plus one overlapping, masked tail block.
//   len  > 32      32-byte AVX2 blocks plus one overlapping, masked tail
//                  block, when AVX2 is available; otherwise the SSE path.
//
// This function requires the POPCNT instruction.
TEXT countbody<>(SB),NOSPLIT,$0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for. Only the low byte of AX ends up
	// in the result: the two unpacks replicate byte 0 across the low
	// dword, and PSHUFL $0 copies that dword to all four lanes.
	MOVD	AX, X0
	PUNPCKLBW	X0, X0
	PUNPCKLBW	X0, X0
	PSHUFL	$0, X0, X0

	CMPQ	BX, $16
	JLT	small

	MOVQ	$0, R12			// R12 = running count
	MOVQ	SI, DI			// DI = cursor over the data

	CMPQ	BX, $32
	JA	avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1: matching bytes become 0xFF.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB	X1, DX
	// Count number of matching bytes.
	POPCNTL	DX, DX
	// Accumulate into R12.
	ADDQ	DX, R12
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	// Keep going while a full 16-byte block starting at DI fits in the data.
	CMPQ	DI, AX
	JBE	sseloop

	// Get the number of bytes to consider in the last 16 bytes.
	// Zero means the loop already covered everything.
	ANDQ	$15, BX
	JZ	end

	// Create mask to ignore overlap between previous 16 byte block
	// and the next: keep only the top BX bits of the 16-bit match mask.
	MOVQ	$16, CX
	SUBQ	BX, CX
	MOVQ	$0xFFFF, R10
	SARQ	CL, R10
	SALQ	CL, R10

	// Process the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched so we need to mask part of it.
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, DX
	// Apply mask.
	ANDQ	R10, DX
	POPCNTL	DX, DX
	ADDQ	DX, R12
end:
	MOVQ	R12, (R8)
	RET

// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	endzero

	// Check if we'll load across a page boundary: bits 4-11 of SI+16 are
	// all zero exactly when SI lies in the last 16 bytes of a 4096-byte
	// page, in which case a 16-byte load at SI could touch the next page.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	// We must ignore high bytes as they aren't part of our slice.
	// Create mask with the low BX bits set.
	MOVB	BX, CX
	MOVQ	$1, R10
	SALQ	CL, R10
	SUBQ	$1, R10

	// Load data.
	MOVOU	(SI), X1
	// Compare target byte with each byte in data.
	PCMPEQB	X0, X1
	// Move result bits to integer register.
	PMOVMSKB	X1, DX
	// Apply mask.
	ANDQ	R10, DX
	POPCNTL	DX, DX
	// Directly return DX, we don't need to accumulate
	// since we have <16 bytes.
	MOVQ	DX, (R8)
	RET
endzero:
	MOVQ	$0, (R8)
	RET

endofpage:
	// The load below ends at the last byte of the data instead of starting
	// at the first, so we must ignore low bytes as they aren't part of our
	// slice. Create mask with the top BX bits (of 16) set.
	MOVQ	$16, CX
	SUBQ	BX, CX
	MOVQ	$0xFFFF, R10
	SARQ	CL, R10
	SALQ	CL, R10

	// Load data into the high end of X1.
	MOVOU	-16(SI)(BX*1), X1
	// Compare target byte with each byte in data.
	PCMPEQB	X0, X1
	// Move result bits to integer register.
	PMOVMSKB	X1, DX
	// Apply mask.
	ANDQ	R10, DX
	// Directly return DX, we don't need to accumulate
	// since we have <16 bytes.
	POPCNTL	DX, DX
	MOVQ	DX, (R8)
	RET

avx2:
#ifndef hasAVX2
	// Fall back to the SSE loop when the AVX2 feature flag is not set.
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JNE	sse
#endif
	// X0 already holds the sought byte in every lane (set up at function
	// entry), so its low byte can be broadcast to Y1 directly.
	LEAQ	-32(SI)(BX*1), R11	// R11 = address of last 32 bytes
	VPBROADCASTB	X0, Y1
avx2_loop:
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPMOVMSKB	Y3, DX
	POPCNTL	DX, DX
	ADDQ	DX, R12
	ADDQ	$32, DI
	// DI and R11 are addresses, so compare them unsigned, as the SSE loop
	// does. Keep going while a full 32-byte block starting at DI fits.
	CMPQ	DI, R11
	JBE	avx2_loop

	// Get the number of bytes the loop has not yet counted. The loop
	// always consumes whole 32-byte blocks from the start, so this is
	// len mod 32; zero means every byte is already accounted for and
	// the overlapping tail block can be skipped.
	ANDQ	$31, BX
	JZ	endavx

	// Process the last 32 bytes.
	// There is an overlap with the previous block.
	VMOVDQU	(R11), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPMOVMSKB	Y3, DX
	// Exit AVX mode.
	VZEROUPPER

	// Create mask to ignore overlap between previous 32 byte block
	// and the next: keep only the top BX bits of the 32-bit match mask.
	MOVQ	$32, CX
	SUBQ	BX, CX
	MOVQ	$0xFFFFFFFF, R10
	SARQ	CL, R10
	SALQ	CL, R10
	// Apply mask.
	ANDQ	R10, DX
	POPCNTL	DX, DX
	ADDQ	DX, R12
	MOVQ	R12, (R8)
	RET
endavx:
	// Exit AVX mode.
	VZEROUPPER
	MOVQ	R12, (R8)
	RET
209
View as plain text