Coverage Report

Created: 2025-06-10 13:21

/bitcoin/src/crypto/sha256_sse4.cpp
Line | Count | Source
1
// Copyright (c) 2017-2022 The Bitcoin Core developers
2
// Distributed under the MIT software license, see the accompanying
3
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
4
//
5
// This is a translation to GCC extended asm syntax from YASM code by Intel
6
// (available at the bottom of this file).
7
8
#include <cstdlib>
9
#include <stdint.h>
10
11
#if defined(__x86_64__) || defined(__amd64__)
12
13
namespace sha256_sse4
14
{
15
void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
16
#if defined(__clang__)
17
  /*
18
  clang is unable to compile this with -O0 and -fsanitize=address.
19
  See upstream bug: https://github.com/llvm/llvm-project/issues/92182.
20
  This also fails to compile with -O2, -fcf-protection & -fsanitize=address.
21
  See https://github.com/bitcoin/bitcoin/issues/31913.
22
  */
23
#if __has_feature(address_sanitizer)
24
  __attribute__((no_sanitize("address")))
25
#endif
26
#endif
27
0
{
28
0
    static const uint32_t K256 alignas(16) [] = {
29
0
        0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
30
0
        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
31
0
        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
32
0
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
33
0
        0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
34
0
        0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
35
0
        0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
36
0
        0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
37
0
        0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
38
0
        0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
39
0
        0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
40
0
        0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
41
0
        0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
42
0
        0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
43
0
        0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
44
0
        0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
45
0
    };
46
0
    static const uint32_t FLIP_MASK alignas(16) [] = {0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f};
47
0
    static const uint32_t SHUF_00BA alignas(16) [] = {0x03020100, 0x0b0a0908, 0xffffffff, 0xffffffff};
48
0
    static const uint32_t SHUF_DC00 alignas(16) [] = {0xffffffff, 0xffffffff, 0x03020100, 0x0b0a0908};
49
0
    uint32_t a, b, c, d, f, g, h, y0, y1, y2;
50
0
    uint64_t tbl;
51
0
    uint64_t inp_end, inp;
52
0
    uint32_t xfer alignas(16) [4];
53
54
0
    __asm__ __volatile__(
55
0
        "shl    $0x6,%2;"
56
0
        "je     Ldone_hash_%=;"
57
0
        "add    %1,%2;"
58
0
        "mov    %2,%14;"
59
0
        "mov    (%0),%3;"
60
0
        "mov    0x4(%0),%4;"
61
0
        "mov    0x8(%0),%5;"
62
0
        "mov    0xc(%0),%6;"
63
0
        "mov    0x10(%0),%k2;"
64
0
        "mov    0x14(%0),%7;"
65
0
        "mov    0x18(%0),%8;"
66
0
        "mov    0x1c(%0),%9;"
67
0
        "movdqa %18,%%xmm12;"
68
0
        "movdqa %19,%%xmm10;"
69
0
        "movdqa %20,%%xmm11;"
70
71
0
        "Lloop0_%=:"
72
0
        "lea    %17,%13;"
73
0
        "movdqu (%1),%%xmm4;"
74
0
        "pshufb %%xmm12,%%xmm4;"
75
0
        "movdqu 0x10(%1),%%xmm5;"
76
0
        "pshufb %%xmm12,%%xmm5;"
77
0
        "movdqu 0x20(%1),%%xmm6;"
78
0
        "pshufb %%xmm12,%%xmm6;"
79
0
        "movdqu 0x30(%1),%%xmm7;"
80
0
        "pshufb %%xmm12,%%xmm7;"
81
0
        "mov    %1,%15;"
82
0
        "mov    $3,%1;"
83
84
0
        "Lloop1_%=:"
85
0
        "movdqa 0x0(%13),%%xmm9;"
86
0
        "paddd  %%xmm4,%%xmm9;"
87
0
        "movdqa %%xmm9,%16;"
88
0
        "movdqa %%xmm7,%%xmm0;"
89
0
        "mov    %k2,%10;"
90
0
        "ror    $0xe,%10;"
91
0
        "mov    %3,%11;"
92
0
        "palignr $0x4,%%xmm6,%%xmm0;"
93
0
        "ror    $0x9,%11;"
94
0
        "xor    %k2,%10;"
95
0
        "mov    %7,%12;"
96
0
        "ror    $0x5,%10;"
97
0
        "movdqa %%xmm5,%%xmm1;"
98
0
        "xor    %3,%11;"
99
0
        "xor    %8,%12;"
100
0
        "paddd  %%xmm4,%%xmm0;"
101
0
        "xor    %k2,%10;"
102
0
        "and    %k2,%12;"
103
0
        "ror    $0xb,%11;"
104
0
        "palignr $0x4,%%xmm4,%%xmm1;"
105
0
        "xor    %3,%11;"
106
0
        "ror    $0x6,%10;"
107
0
        "xor    %8,%12;"
108
0
        "movdqa %%xmm1,%%xmm2;"
109
0
        "ror    $0x2,%11;"
110
0
        "add    %10,%12;"
111
0
        "add    %16,%12;"
112
0
        "movdqa %%xmm1,%%xmm3;"
113
0
        "mov    %3,%10;"
114
0
        "add    %12,%9;"
115
0
        "mov    %3,%12;"
116
0
        "pslld  $0x19,%%xmm1;"
117
0
        "or     %5,%10;"
118
0
        "add    %9,%6;"
119
0
        "and    %5,%12;"
120
0
        "psrld  $0x7,%%xmm2;"
121
0
        "and    %4,%10;"
122
0
        "add    %11,%9;"
123
0
        "por    %%xmm2,%%xmm1;"
124
0
        "or     %12,%10;"
125
0
        "add    %10,%9;"
126
0
        "movdqa %%xmm3,%%xmm2;"
127
0
        "mov    %6,%10;"
128
0
        "mov    %9,%11;"
129
0
        "movdqa %%xmm3,%%xmm8;"
130
0
        "ror    $0xe,%10;"
131
0
        "xor    %6,%10;"
132
0
        "mov    %k2,%12;"
133
0
        "ror    $0x9,%11;"
134
0
        "pslld  $0xe,%%xmm3;"
135
0
        "xor    %9,%11;"
136
0
        "ror    $0x5,%10;"
137
0
        "xor    %7,%12;"
138
0
        "psrld  $0x12,%%xmm2;"
139
0
        "ror    $0xb,%11;"
140
0
        "xor    %6,%10;"
141
0
        "and    %6,%12;"
142
0
        "ror    $0x6,%10;"
143
0
        "pxor   %%xmm3,%%xmm1;"
144
0
        "xor    %9,%11;"
145
0
        "xor    %7,%12;"
146
0
        "psrld  $0x3,%%xmm8;"
147
0
        "add    %10,%12;"
148
0
        "add    4+%16,%12;"
149
0
        "ror    $0x2,%11;"
150
0
        "pxor   %%xmm2,%%xmm1;"
151
0
        "mov    %9,%10;"
152
0
        "add    %12,%8;"
153
0
        "mov    %9,%12;"
154
0
        "pxor   %%xmm8,%%xmm1;"
155
0
        "or     %4,%10;"
156
0
        "add    %8,%5;"
157
0
        "and    %4,%12;"
158
0
        "pshufd $0xfa,%%xmm7,%%xmm2;"
159
0
        "and    %3,%10;"
160
0
        "add    %11,%8;"
161
0
        "paddd  %%xmm1,%%xmm0;"
162
0
        "or     %12,%10;"
163
0
        "add    %10,%8;"
164
0
        "movdqa %%xmm2,%%xmm3;"
165
0
        "mov    %5,%10;"
166
0
        "mov    %8,%11;"
167
0
        "ror    $0xe,%10;"
168
0
        "movdqa %%xmm2,%%xmm8;"
169
0
        "xor    %5,%10;"
170
0
        "ror    $0x9,%11;"
171
0
        "mov    %6,%12;"
172
0
        "xor    %8,%11;"
173
0
        "ror    $0x5,%10;"
174
0
        "psrlq  $0x11,%%xmm2;"
175
0
        "xor    %k2,%12;"
176
0
        "psrlq  $0x13,%%xmm3;"
177
0
        "xor    %5,%10;"
178
0
        "and    %5,%12;"
179
0
        "psrld  $0xa,%%xmm8;"
180
0
        "ror    $0xb,%11;"
181
0
        "xor    %8,%11;"
182
0
        "xor    %k2,%12;"
183
0
        "ror    $0x6,%10;"
184
0
        "pxor   %%xmm3,%%xmm2;"
185
0
        "add    %10,%12;"
186
0
        "ror    $0x2,%11;"
187
0
        "add    8+%16,%12;"
188
0
        "pxor   %%xmm2,%%xmm8;"
189
0
        "mov    %8,%10;"
190
0
        "add    %12,%7;"
191
0
        "mov    %8,%12;"
192
0
        "pshufb %%xmm10,%%xmm8;"
193
0
        "or     %3,%10;"
194
0
        "add    %7,%4;"
195
0
        "and    %3,%12;"
196
0
        "paddd  %%xmm8,%%xmm0;"
197
0
        "and    %9,%10;"
198
0
        "add    %11,%7;"
199
0
        "pshufd $0x50,%%xmm0,%%xmm2;"
200
0
        "or     %12,%10;"
201
0
        "add    %10,%7;"
202
0
        "movdqa %%xmm2,%%xmm3;"
203
0
        "mov    %4,%10;"
204
0
        "ror    $0xe,%10;"
205
0
        "mov    %7,%11;"
206
0
        "movdqa %%xmm2,%%xmm4;"
207
0
        "ror    $0x9,%11;"
208
0
        "xor    %4,%10;"
209
0
        "mov    %5,%12;"
210
0
        "ror    $0x5,%10;"
211
0
        "psrlq  $0x11,%%xmm2;"
212
0
        "xor    %7,%11;"
213
0
        "xor    %6,%12;"
214
0
        "psrlq  $0x13,%%xmm3;"
215
0
        "xor    %4,%10;"
216
0
        "and    %4,%12;"
217
0
        "ror    $0xb,%11;"
218
0
        "psrld  $0xa,%%xmm4;"
219
0
        "xor    %7,%11;"
220
0
        "ror    $0x6,%10;"
221
0
        "xor    %6,%12;"
222
0
        "pxor   %%xmm3,%%xmm2;"
223
0
        "ror    $0x2,%11;"
224
0
        "add    %10,%12;"
225
0
        "add    12+%16,%12;"
226
0
        "pxor   %%xmm2,%%xmm4;"
227
0
        "mov    %7,%10;"
228
0
        "add    %12,%k2;"
229
0
        "mov    %7,%12;"
230
0
        "pshufb %%xmm11,%%xmm4;"
231
0
        "or     %9,%10;"
232
0
        "add    %k2,%3;"
233
0
        "and    %9,%12;"
234
0
        "paddd  %%xmm0,%%xmm4;"
235
0
        "and    %8,%10;"
236
0
        "add    %11,%k2;"
237
0
        "or     %12,%10;"
238
0
        "add    %10,%k2;"
239
0
        "movdqa 0x10(%13),%%xmm9;"
240
0
        "paddd  %%xmm5,%%xmm9;"
241
0
        "movdqa %%xmm9,%16;"
242
0
        "movdqa %%xmm4,%%xmm0;"
243
0
        "mov    %3,%10;"
244
0
        "ror    $0xe,%10;"
245
0
        "mov    %k2,%11;"
246
0
        "palignr $0x4,%%xmm7,%%xmm0;"
247
0
        "ror    $0x9,%11;"
248
0
        "xor    %3,%10;"
249
0
        "mov    %4,%12;"
250
0
        "ror    $0x5,%10;"
251
0
        "movdqa %%xmm6,%%xmm1;"
252
0
        "xor    %k2,%11;"
253
0
        "xor    %5,%12;"
254
0
        "paddd  %%xmm5,%%xmm0;"
255
0
        "xor    %3,%10;"
256
0
        "and    %3,%12;"
257
0
        "ror    $0xb,%11;"
258
0
        "palignr $0x4,%%xmm5,%%xmm1;"
259
0
        "xor    %k2,%11;"
260
0
        "ror    $0x6,%10;"
261
0
        "xor    %5,%12;"
262
0
        "movdqa %%xmm1,%%xmm2;"
263
0
        "ror    $0x2,%11;"
264
0
        "add    %10,%12;"
265
0
        "add    %16,%12;"
266
0
        "movdqa %%xmm1,%%xmm3;"
267
0
        "mov    %k2,%10;"
268
0
        "add    %12,%6;"
269
0
        "mov    %k2,%12;"
270
0
        "pslld  $0x19,%%xmm1;"
271
0
        "or     %8,%10;"
272
0
        "add    %6,%9;"
273
0
        "and    %8,%12;"
274
0
        "psrld  $0x7,%%xmm2;"
275
0
        "and    %7,%10;"
276
0
        "add    %11,%6;"
277
0
        "por    %%xmm2,%%xmm1;"
278
0
        "or     %12,%10;"
279
0
        "add    %10,%6;"
280
0
        "movdqa %%xmm3,%%xmm2;"
281
0
        "mov    %9,%10;"
282
0
        "mov    %6,%11;"
283
0
        "movdqa %%xmm3,%%xmm8;"
284
0
        "ror    $0xe,%10;"
285
0
        "xor    %9,%10;"
286
0
        "mov    %3,%12;"
287
0
        "ror    $0x9,%11;"
288
0
        "pslld  $0xe,%%xmm3;"
289
0
        "xor    %6,%11;"
290
0
        "ror    $0x5,%10;"
291
0
        "xor    %4,%12;"
292
0
        "psrld  $0x12,%%xmm2;"
293
0
        "ror    $0xb,%11;"
294
0
        "xor    %9,%10;"
295
0
        "and    %9,%12;"
296
0
        "ror    $0x6,%10;"
297
0
        "pxor   %%xmm3,%%xmm1;"
298
0
        "xor    %6,%11;"
299
0
        "xor    %4,%12;"
300
0
        "psrld  $0x3,%%xmm8;"
301
0
        "add    %10,%12;"
302
0
        "add    4+%16,%12;"
303
0
        "ror    $0x2,%11;"
304
0
        "pxor   %%xmm2,%%xmm1;"
305
0
        "mov    %6,%10;"
306
0
        "add    %12,%5;"
307
0
        "mov    %6,%12;"
308
0
        "pxor   %%xmm8,%%xmm1;"
309
0
        "or     %7,%10;"
310
0
        "add    %5,%8;"
311
0
        "and    %7,%12;"
312
0
        "pshufd $0xfa,%%xmm4,%%xmm2;"
313
0
        "and    %k2,%10;"
314
0
        "add    %11,%5;"
315
0
        "paddd  %%xmm1,%%xmm0;"
316
0
        "or     %12,%10;"
317
0
        "add    %10,%5;"
318
0
        "movdqa %%xmm2,%%xmm3;"
319
0
        "mov    %8,%10;"
320
0
        "mov    %5,%11;"
321
0
        "ror    $0xe,%10;"
322
0
        "movdqa %%xmm2,%%xmm8;"
323
0
        "xor    %8,%10;"
324
0
        "ror    $0x9,%11;"
325
0
        "mov    %9,%12;"
326
0
        "xor    %5,%11;"
327
0
        "ror    $0x5,%10;"
328
0
        "psrlq  $0x11,%%xmm2;"
329
0
        "xor    %3,%12;"
330
0
        "psrlq  $0x13,%%xmm3;"
331
0
        "xor    %8,%10;"
332
0
        "and    %8,%12;"
333
0
        "psrld  $0xa,%%xmm8;"
334
0
        "ror    $0xb,%11;"
335
0
        "xor    %5,%11;"
336
0
        "xor    %3,%12;"
337
0
        "ror    $0x6,%10;"
338
0
        "pxor   %%xmm3,%%xmm2;"
339
0
        "add    %10,%12;"
340
0
        "ror    $0x2,%11;"
341
0
        "add    8+%16,%12;"
342
0
        "pxor   %%xmm2,%%xmm8;"
343
0
        "mov    %5,%10;"
344
0
        "add    %12,%4;"
345
0
        "mov    %5,%12;"
346
0
        "pshufb %%xmm10,%%xmm8;"
347
0
        "or     %k2,%10;"
348
0
        "add    %4,%7;"
349
0
        "and    %k2,%12;"
350
0
        "paddd  %%xmm8,%%xmm0;"
351
0
        "and    %6,%10;"
352
0
        "add    %11,%4;"
353
0
        "pshufd $0x50,%%xmm0,%%xmm2;"
354
0
        "or     %12,%10;"
355
0
        "add    %10,%4;"
356
0
        "movdqa %%xmm2,%%xmm3;"
357
0
        "mov    %7,%10;"
358
0
        "ror    $0xe,%10;"
359
0
        "mov    %4,%11;"
360
0
        "movdqa %%xmm2,%%xmm5;"
361
0
        "ror    $0x9,%11;"
362
0
        "xor    %7,%10;"
363
0
        "mov    %8,%12;"
364
0
        "ror    $0x5,%10;"
365
0
        "psrlq  $0x11,%%xmm2;"
366
0
        "xor    %4,%11;"
367
0
        "xor    %9,%12;"
368
0
        "psrlq  $0x13,%%xmm3;"
369
0
        "xor    %7,%10;"
370
0
        "and    %7,%12;"
371
0
        "ror    $0xb,%11;"
372
0
        "psrld  $0xa,%%xmm5;"
373
0
        "xor    %4,%11;"
374
0
        "ror    $0x6,%10;"
375
0
        "xor    %9,%12;"
376
0
        "pxor   %%xmm3,%%xmm2;"
377
0
        "ror    $0x2,%11;"
378
0
        "add    %10,%12;"
379
0
        "add    12+%16,%12;"
380
0
        "pxor   %%xmm2,%%xmm5;"
381
0
        "mov    %4,%10;"
382
0
        "add    %12,%3;"
383
0
        "mov    %4,%12;"
384
0
        "pshufb %%xmm11,%%xmm5;"
385
0
        "or     %6,%10;"
386
0
        "add    %3,%k2;"
387
0
        "and    %6,%12;"
388
0
        "paddd  %%xmm0,%%xmm5;"
389
0
        "and    %5,%10;"
390
0
        "add    %11,%3;"
391
0
        "or     %12,%10;"
392
0
        "add    %10,%3;"
393
0
        "movdqa 0x20(%13),%%xmm9;"
394
0
        "paddd  %%xmm6,%%xmm9;"
395
0
        "movdqa %%xmm9,%16;"
396
0
        "movdqa %%xmm5,%%xmm0;"
397
0
        "mov    %k2,%10;"
398
0
        "ror    $0xe,%10;"
399
0
        "mov    %3,%11;"
400
0
        "palignr $0x4,%%xmm4,%%xmm0;"
401
0
        "ror    $0x9,%11;"
402
0
        "xor    %k2,%10;"
403
0
        "mov    %7,%12;"
404
0
        "ror    $0x5,%10;"
405
0
        "movdqa %%xmm7,%%xmm1;"
406
0
        "xor    %3,%11;"
407
0
        "xor    %8,%12;"
408
0
        "paddd  %%xmm6,%%xmm0;"
409
0
        "xor    %k2,%10;"
410
0
        "and    %k2,%12;"
411
0
        "ror    $0xb,%11;"
412
0
        "palignr $0x4,%%xmm6,%%xmm1;"
413
0
        "xor    %3,%11;"
414
0
        "ror    $0x6,%10;"
415
0
        "xor    %8,%12;"
416
0
        "movdqa %%xmm1,%%xmm2;"
417
0
        "ror    $0x2,%11;"
418
0
        "add    %10,%12;"
419
0
        "add    %16,%12;"
420
0
        "movdqa %%xmm1,%%xmm3;"
421
0
        "mov    %3,%10;"
422
0
        "add    %12,%9;"
423
0
        "mov    %3,%12;"
424
0
        "pslld  $0x19,%%xmm1;"
425
0
        "or     %5,%10;"
426
0
        "add    %9,%6;"
427
0
        "and    %5,%12;"
428
0
        "psrld  $0x7,%%xmm2;"
429
0
        "and    %4,%10;"
430
0
        "add    %11,%9;"
431
0
        "por    %%xmm2,%%xmm1;"
432
0
        "or     %12,%10;"
433
0
        "add    %10,%9;"
434
0
        "movdqa %%xmm3,%%xmm2;"
435
0
        "mov    %6,%10;"
436
0
        "mov    %9,%11;"
437
0
        "movdqa %%xmm3,%%xmm8;"
438
0
        "ror    $0xe,%10;"
439
0
        "xor    %6,%10;"
440
0
        "mov    %k2,%12;"
441
0
        "ror    $0x9,%11;"
442
0
        "pslld  $0xe,%%xmm3;"
443
0
        "xor    %9,%11;"
444
0
        "ror    $0x5,%10;"
445
0
        "xor    %7,%12;"
446
0
        "psrld  $0x12,%%xmm2;"
447
0
        "ror    $0xb,%11;"
448
0
        "xor    %6,%10;"
449
0
        "and    %6,%12;"
450
0
        "ror    $0x6,%10;"
451
0
        "pxor   %%xmm3,%%xmm1;"
452
0
        "xor    %9,%11;"
453
0
        "xor    %7,%12;"
454
0
        "psrld  $0x3,%%xmm8;"
455
0
        "add    %10,%12;"
456
0
        "add    4+%16,%12;"
457
0
        "ror    $0x2,%11;"
458
0
        "pxor   %%xmm2,%%xmm1;"
459
0
        "mov    %9,%10;"
460
0
        "add    %12,%8;"
461
0
        "mov    %9,%12;"
462
0
        "pxor   %%xmm8,%%xmm1;"
463
0
        "or     %4,%10;"
464
0
        "add    %8,%5;"
465
0
        "and    %4,%12;"
466
0
        "pshufd $0xfa,%%xmm5,%%xmm2;"
467
0
        "and    %3,%10;"
468
0
        "add    %11,%8;"
469
0
        "paddd  %%xmm1,%%xmm0;"
470
0
        "or     %12,%10;"
471
0
        "add    %10,%8;"
472
0
        "movdqa %%xmm2,%%xmm3;"
473
0
        "mov    %5,%10;"
474
0
        "mov    %8,%11;"
475
0
        "ror    $0xe,%10;"
476
0
        "movdqa %%xmm2,%%xmm8;"
477
0
        "xor    %5,%10;"
478
0
        "ror    $0x9,%11;"
479
0
        "mov    %6,%12;"
480
0
        "xor    %8,%11;"
481
0
        "ror    $0x5,%10;"
482
0
        "psrlq  $0x11,%%xmm2;"
483
0
        "xor    %k2,%12;"
484
0
        "psrlq  $0x13,%%xmm3;"
485
0
        "xor    %5,%10;"
486
0
        "and    %5,%12;"
487
0
        "psrld  $0xa,%%xmm8;"
488
0
        "ror    $0xb,%11;"
489
0
        "xor    %8,%11;"
490
0
        "xor    %k2,%12;"
491
0
        "ror    $0x6,%10;"
492
0
        "pxor   %%xmm3,%%xmm2;"
493
0
        "add    %10,%12;"
494
0
        "ror    $0x2,%11;"
495
0
        "add    8+%16,%12;"
496
0
        "pxor   %%xmm2,%%xmm8;"
497
0
        "mov    %8,%10;"
498
0
        "add    %12,%7;"
499
0
        "mov    %8,%12;"
500
0
        "pshufb %%xmm10,%%xmm8;"
501
0
        "or     %3,%10;"
502
0
        "add    %7,%4;"
503
0
        "and    %3,%12;"
504
0
        "paddd  %%xmm8,%%xmm0;"
505
0
        "and    %9,%10;"
506
0
        "add    %11,%7;"
507
0
        "pshufd $0x50,%%xmm0,%%xmm2;"
508
0
        "or     %12,%10;"
509
0
        "add    %10,%7;"
510
0
        "movdqa %%xmm2,%%xmm3;"
511
0
        "mov    %4,%10;"
512
0
        "ror    $0xe,%10;"
513
0
        "mov    %7,%11;"
514
0
        "movdqa %%xmm2,%%xmm6;"
515
0
        "ror    $0x9,%11;"
516
0
        "xor    %4,%10;"
517
0
        "mov    %5,%12;"
518
0
        "ror    $0x5,%10;"
519
0
        "psrlq  $0x11,%%xmm2;"
520
0
        "xor    %7,%11;"
521
0
        "xor    %6,%12;"
522
0
        "psrlq  $0x13,%%xmm3;"
523
0
        "xor    %4,%10;"
524
0
        "and    %4,%12;"
525
0
        "ror    $0xb,%11;"
526
0
        "psrld  $0xa,%%xmm6;"
527
0
        "xor    %7,%11;"
528
0
        "ror    $0x6,%10;"
529
0
        "xor    %6,%12;"
530
0
        "pxor   %%xmm3,%%xmm2;"
531
0
        "ror    $0x2,%11;"
532
0
        "add    %10,%12;"
533
0
        "add    12+%16,%12;"
534
0
        "pxor   %%xmm2,%%xmm6;"
535
0
        "mov    %7,%10;"
536
0
        "add    %12,%k2;"
537
0
        "mov    %7,%12;"
538
0
        "pshufb %%xmm11,%%xmm6;"
539
0
        "or     %9,%10;"
540
0
        "add    %k2,%3;"
541
0
        "and    %9,%12;"
542
0
        "paddd  %%xmm0,%%xmm6;"
543
0
        "and    %8,%10;"
544
0
        "add    %11,%k2;"
545
0
        "or     %12,%10;"
546
0
        "add    %10,%k2;"
547
0
        "movdqa 0x30(%13),%%xmm9;"
548
0
        "paddd  %%xmm7,%%xmm9;"
549
0
        "movdqa %%xmm9,%16;"
550
0
        "add    $0x40,%13;"
551
0
        "movdqa %%xmm6,%%xmm0;"
552
0
        "mov    %3,%10;"
553
0
        "ror    $0xe,%10;"
554
0
        "mov    %k2,%11;"
555
0
        "palignr $0x4,%%xmm5,%%xmm0;"
556
0
        "ror    $0x9,%11;"
557
0
        "xor    %3,%10;"
558
0
        "mov    %4,%12;"
559
0
        "ror    $0x5,%10;"
560
0
        "movdqa %%xmm4,%%xmm1;"
561
0
        "xor    %k2,%11;"
562
0
        "xor    %5,%12;"
563
0
        "paddd  %%xmm7,%%xmm0;"
564
0
        "xor    %3,%10;"
565
0
        "and    %3,%12;"
566
0
        "ror    $0xb,%11;"
567
0
        "palignr $0x4,%%xmm7,%%xmm1;"
568
0
        "xor    %k2,%11;"
569
0
        "ror    $0x6,%10;"
570
0
        "xor    %5,%12;"
571
0
        "movdqa %%xmm1,%%xmm2;"
572
0
        "ror    $0x2,%11;"
573
0
        "add    %10,%12;"
574
0
        "add    %16,%12;"
575
0
        "movdqa %%xmm1,%%xmm3;"
576
0
        "mov    %k2,%10;"
577
0
        "add    %12,%6;"
578
0
        "mov    %k2,%12;"
579
0
        "pslld  $0x19,%%xmm1;"
580
0
        "or     %8,%10;"
581
0
        "add    %6,%9;"
582
0
        "and    %8,%12;"
583
0
        "psrld  $0x7,%%xmm2;"
584
0
        "and    %7,%10;"
585
0
        "add    %11,%6;"
586
0
        "por    %%xmm2,%%xmm1;"
587
0
        "or     %12,%10;"
588
0
        "add    %10,%6;"
589
0
        "movdqa %%xmm3,%%xmm2;"
590
0
        "mov    %9,%10;"
591
0
        "mov    %6,%11;"
592
0
        "movdqa %%xmm3,%%xmm8;"
593
0
        "ror    $0xe,%10;"
594
0
        "xor    %9,%10;"
595
0
        "mov    %3,%12;"
596
0
        "ror    $0x9,%11;"
597
0
        "pslld  $0xe,%%xmm3;"
598
0
        "xor    %6,%11;"
599
0
        "ror    $0x5,%10;"
600
0
        "xor    %4,%12;"
601
0
        "psrld  $0x12,%%xmm2;"
602
0
        "ror    $0xb,%11;"
603
0
        "xor    %9,%10;"
604
0
        "and    %9,%12;"
605
0
        "ror    $0x6,%10;"
606
0
        "pxor   %%xmm3,%%xmm1;"
607
0
        "xor    %6,%11;"
608
0
        "xor    %4,%12;"
609
0
        "psrld  $0x3,%%xmm8;"
610
0
        "add    %10,%12;"
611
0
        "add    4+%16,%12;"
612
0
        "ror    $0x2,%11;"
613
0
        "pxor   %%xmm2,%%xmm1;"
614
0
        "mov    %6,%10;"
615
0
        "add    %12,%5;"
616
0
        "mov    %6,%12;"
617
0
        "pxor   %%xmm8,%%xmm1;"
618
0
        "or     %7,%10;"
619
0
        "add    %5,%8;"
620
0
        "and    %7,%12;"
621
0
        "pshufd $0xfa,%%xmm6,%%xmm2;"
622
0
        "and    %k2,%10;"
623
0
        "add    %11,%5;"
624
0
        "paddd  %%xmm1,%%xmm0;"
625
0
        "or     %12,%10;"
626
0
        "add    %10,%5;"
627
0
        "movdqa %%xmm2,%%xmm3;"
628
0
        "mov    %8,%10;"
629
0
        "mov    %5,%11;"
630
0
        "ror    $0xe,%10;"
631
0
        "movdqa %%xmm2,%%xmm8;"
632
0
        "xor    %8,%10;"
633
0
        "ror    $0x9,%11;"
634
0
        "mov    %9,%12;"
635
0
        "xor    %5,%11;"
636
0
        "ror    $0x5,%10;"
637
0
        "psrlq  $0x11,%%xmm2;"
638
0
        "xor    %3,%12;"
639
0
        "psrlq  $0x13,%%xmm3;"
640
0
        "xor    %8,%10;"
641
0
        "and    %8,%12;"
642
0
        "psrld  $0xa,%%xmm8;"
643
0
        "ror    $0xb,%11;"
644
0
        "xor    %5,%11;"
645
0
        "xor    %3,%12;"
646
0
        "ror    $0x6,%10;"
647
0
        "pxor   %%xmm3,%%xmm2;"
648
0
        "add    %10,%12;"
649
0
        "ror    $0x2,%11;"
650
0
        "add    8+%16,%12;"
651
0
        "pxor   %%xmm2,%%xmm8;"
652
0
        "mov    %5,%10;"
653
0
        "add    %12,%4;"
654
0
        "mov    %5,%12;"
655
0
        "pshufb %%xmm10,%%xmm8;"
656
0
        "or     %k2,%10;"
657
0
        "add    %4,%7;"
658
0
        "and    %k2,%12;"
659
0
        "paddd  %%xmm8,%%xmm0;"
660
0
        "and    %6,%10;"
661
0
        "add    %11,%4;"
662
0
        "pshufd $0x50,%%xmm0,%%xmm2;"
663
0
        "or     %12,%10;"
664
0
        "add    %10,%4;"
665
0
        "movdqa %%xmm2,%%xmm3;"
666
0
        "mov    %7,%10;"
667
0
        "ror    $0xe,%10;"
668
0
        "mov    %4,%11;"
669
0
        "movdqa %%xmm2,%%xmm7;"
670
0
        "ror    $0x9,%11;"
671
0
        "xor    %7,%10;"
672
0
        "mov    %8,%12;"
673
0
        "ror    $0x5,%10;"
674
0
        "psrlq  $0x11,%%xmm2;"
675
0
        "xor    %4,%11;"
676
0
        "xor    %9,%12;"
677
0
        "psrlq  $0x13,%%xmm3;"
678
0
        "xor    %7,%10;"
679
0
        "and    %7,%12;"
680
0
        "ror    $0xb,%11;"
681
0
        "psrld  $0xa,%%xmm7;"
682
0
        "xor    %4,%11;"
683
0
        "ror    $0x6,%10;"
684
0
        "xor    %9,%12;"
685
0
        "pxor   %%xmm3,%%xmm2;"
686
0
        "ror    $0x2,%11;"
687
0
        "add    %10,%12;"
688
0
        "add    12+%16,%12;"
689
0
        "pxor   %%xmm2,%%xmm7;"
690
0
        "mov    %4,%10;"
691
0
        "add    %12,%3;"
692
0
        "mov    %4,%12;"
693
0
        "pshufb %%xmm11,%%xmm7;"
694
0
        "or     %6,%10;"
695
0
        "add    %3,%k2;"
696
0
        "and    %6,%12;"
697
0
        "paddd  %%xmm0,%%xmm7;"
698
0
        "and    %5,%10;"
699
0
        "add    %11,%3;"
700
0
        "or     %12,%10;"
701
0
        "add    %10,%3;"
702
0
        "sub    $0x1,%1;"
703
0
        "jne    Lloop1_%=;"
704
0
        "mov    $0x2,%1;"
705
706
0
        "Lloop2_%=:"
707
0
        "paddd  0x0(%13),%%xmm4;"
708
0
        "movdqa %%xmm4,%16;"
709
0
        "mov    %k2,%10;"
710
0
        "ror    $0xe,%10;"
711
0
        "mov    %3,%11;"
712
0
        "xor    %k2,%10;"
713
0
        "ror    $0x9,%11;"
714
0
        "mov    %7,%12;"
715
0
        "xor    %3,%11;"
716
0
        "ror    $0x5,%10;"
717
0
        "xor    %8,%12;"
718
0
        "xor    %k2,%10;"
719
0
        "ror    $0xb,%11;"
720
0
        "and    %k2,%12;"
721
0
        "xor    %3,%11;"
722
0
        "ror    $0x6,%10;"
723
0
        "xor    %8,%12;"
724
0
        "add    %10,%12;"
725
0
        "ror    $0x2,%11;"
726
0
        "add    %16,%12;"
727
0
        "mov    %3,%10;"
728
0
        "add    %12,%9;"
729
0
        "mov    %3,%12;"
730
0
        "or     %5,%10;"
731
0
        "add    %9,%6;"
732
0
        "and    %5,%12;"
733
0
        "and    %4,%10;"
734
0
        "add    %11,%9;"
735
0
        "or     %12,%10;"
736
0
        "add    %10,%9;"
737
0
        "mov    %6,%10;"
738
0
        "ror    $0xe,%10;"
739
0
        "mov    %9,%11;"
740
0
        "xor    %6,%10;"
741
0
        "ror    $0x9,%11;"
742
0
        "mov    %k2,%12;"
743
0
        "xor    %9,%11;"
744
0
        "ror    $0x5,%10;"
745
0
        "xor    %7,%12;"
746
0
        "xor    %6,%10;"
747
0
        "ror    $0xb,%11;"
748
0
        "and    %6,%12;"
749
0
        "xor    %9,%11;"
750
0
        "ror    $0x6,%10;"
751
0
        "xor    %7,%12;"
752
0
        "add    %10,%12;"
753
0
        "ror    $0x2,%11;"
754
0
        "add    4+%16,%12;"
755
0
        "mov    %9,%10;"
756
0
        "add    %12,%8;"
757
0
        "mov    %9,%12;"
758
0
        "or     %4,%10;"
759
0
        "add    %8,%5;"
760
0
        "and    %4,%12;"
761
0
        "and    %3,%10;"
762
0
        "add    %11,%8;"
763
0
        "or     %12,%10;"
764
0
        "add    %10,%8;"
765
0
        "mov    %5,%10;"
766
0
        "ror    $0xe,%10;"
767
0
        "mov    %8,%11;"
768
0
        "xor    %5,%10;"
769
0
        "ror    $0x9,%11;"
770
0
        "mov    %6,%12;"
771
0
        "xor    %8,%11;"
772
0
        "ror    $0x5,%10;"
773
0
        "xor    %k2,%12;"
774
0
        "xor    %5,%10;"
775
0
        "ror    $0xb,%11;"
776
0
        "and    %5,%12;"
777
0
        "xor    %8,%11;"
778
0
        "ror    $0x6,%10;"
779
0
        "xor    %k2,%12;"
780
0
        "add    %10,%12;"
781
0
        "ror    $0x2,%11;"
782
0
        "add    8+%16,%12;"
783
0
        "mov    %8,%10;"
784
0
        "add    %12,%7;"
785
0
        "mov    %8,%12;"
786
0
        "or     %3,%10;"
787
0
        "add    %7,%4;"
788
0
        "and    %3,%12;"
789
0
        "and    %9,%10;"
790
0
        "add    %11,%7;"
791
0
        "or     %12,%10;"
792
0
        "add    %10,%7;"
793
0
        "mov    %4,%10;"
794
0
        "ror    $0xe,%10;"
795
0
        "mov    %7,%11;"
796
0
        "xor    %4,%10;"
797
0
        "ror    $0x9,%11;"
798
0
        "mov    %5,%12;"
799
0
        "xor    %7,%11;"
800
0
        "ror    $0x5,%10;"
801
0
        "xor    %6,%12;"
802
0
        "xor    %4,%10;"
803
0
        "ror    $0xb,%11;"
804
0
        "and    %4,%12;"
805
0
        "xor    %7,%11;"
806
0
        "ror    $0x6,%10;"
807
0
        "xor    %6,%12;"
808
0
        "add    %10,%12;"
809
0
        "ror    $0x2,%11;"
810
0
        "add    12+%16,%12;"
811
0
        "mov    %7,%10;"
812
0
        "add    %12,%k2;"
813
0
        "mov    %7,%12;"
814
0
        "or     %9,%10;"
815
0
        "add    %k2,%3;"
816
0
        "and    %9,%12;"
817
0
        "and    %8,%10;"
818
0
        "add    %11,%k2;"
819
0
        "or     %12,%10;"
820
0
        "add    %10,%k2;"
821
0
        "paddd  0x10(%13),%%xmm5;"
822
0
        "movdqa %%xmm5,%16;"
823
0
        "add    $0x20,%13;"
824
0
        "mov    %3,%10;"
825
0
        "ror    $0xe,%10;"
826
0
        "mov    %k2,%11;"
827
0
        "xor    %3,%10;"
828
0
        "ror    $0x9,%11;"
829
0
        "mov    %4,%12;"
830
0
        "xor    %k2,%11;"
831
0
        "ror    $0x5,%10;"
832
0
        "xor    %5,%12;"
833
0
        "xor    %3,%10;"
834
0
        "ror    $0xb,%11;"
835
0
        "and    %3,%12;"
836
0
        "xor    %k2,%11;"
837
0
        "ror    $0x6,%10;"
838
0
        "xor    %5,%12;"
839
0
        "add    %10,%12;"
840
0
        "ror    $0x2,%11;"
841
0
        "add    %16,%12;"
842
0
        "mov    %k2,%10;"
843
0
        "add    %12,%6;"
844
0
        "mov    %k2,%12;"
845
0
        "or     %8,%10;"
846
0
        "add    %6,%9;"
847
0
        "and    %8,%12;"
848
0
        "and    %7,%10;"
849
0
        "add    %11,%6;"
850
0
        "or     %12,%10;"
851
0
        "add    %10,%6;"
852
0
        "mov    %9,%10;"
853
0
        "ror    $0xe,%10;"
854
0
        "mov    %6,%11;"
855
0
        "xor    %9,%10;"
856
0
        "ror    $0x9,%11;"
857
0
        "mov    %3,%12;"
858
0
        "xor    %6,%11;"
859
0
        "ror    $0x5,%10;"
860
0
        "xor    %4,%12;"
861
0
        "xor    %9,%10;"
862
0
        "ror    $0xb,%11;"
863
0
        "and    %9,%12;"
864
0
        "xor    %6,%11;"
865
0
        "ror    $0x6,%10;"
866
0
        "xor    %4,%12;"
867
0
        "add    %10,%12;"
868
0
        "ror    $0x2,%11;"
869
0
        "add    4+%16,%12;"
870
0
        "mov    %6,%10;"
871
0
        "add    %12,%5;"
872
0
        "mov    %6,%12;"
873
0
        "or     %7,%10;"
874
0
        "add    %5,%8;"
875
0
        "and    %7,%12;"
876
0
        "and    %k2,%10;"
877
0
        "add    %11,%5;"
878
0
        "or     %12,%10;"
879
0
        "add    %10,%5;"
880
0
        "mov    %8,%10;"
881
0
        "ror    $0xe,%10;"
882
0
        "mov    %5,%11;"
883
0
        "xor    %8,%10;"
884
0
        "ror    $0x9,%11;"
885
0
        "mov    %9,%12;"
886
0
        "xor    %5,%11;"
887
0
        "ror    $0x5,%10;"
888
0
        "xor    %3,%12;"
889
0
        "xor    %8,%10;"
890
0
        "ror    $0xb,%11;"
891
0
        "and    %8,%12;"
892
0
        "xor    %5,%11;"
893
0
        "ror    $0x6,%10;"
894
0
        "xor    %3,%12;"
895
0
        "add    %10,%12;"
896
0
        "ror    $0x2,%11;"
897
0
        "add    8+%16,%12;"
898
0
        "mov    %5,%10;"
899
0
        "add    %12,%4;"
900
0
        "mov    %5,%12;"
901
0
        "or     %k2,%10;"
902
0
        "add    %4,%7;"
903
0
        "and    %k2,%12;"
904
0
        "and    %6,%10;"
905
0
        "add    %11,%4;"
906
0
        "or     %12,%10;"
907
0
        "add    %10,%4;"
908
0
        "mov    %7,%10;"
909
0
        "ror    $0xe,%10;"
910
0
        "mov    %4,%11;"
911
0
        "xor    %7,%10;"
912
0
        "ror    $0x9,%11;"
913
0
        "mov    %8,%12;"
914
0
        "xor    %4,%11;"
915
0
        "ror    $0x5,%10;"
916
0
        "xor    %9,%12;"
917
0
        "xor    %7,%10;"
918
0
        "ror    $0xb,%11;"
919
0
        "and    %7,%12;"
920
0
        "xor    %4,%11;"
921
0
        "ror    $0x6,%10;"
922
0
        "xor    %9,%12;"
923
0
        "add    %10,%12;"
924
0
        "ror    $0x2,%11;"
925
0
        "add    12+%16,%12;"
926
0
        "mov    %4,%10;"
927
0
        "add    %12,%3;"
928
0
        "mov    %4,%12;"
929
0
        "or     %6,%10;"
930
0
        "add    %3,%k2;"
931
0
        "and    %6,%12;"
932
0
        "and    %5,%10;"
933
0
        "add    %11,%3;"
934
0
        "or     %12,%10;"
935
0
        "add    %10,%3;"
936
0
        "movdqa %%xmm6,%%xmm4;"
937
0
        "movdqa %%xmm7,%%xmm5;"
938
0
        "sub    $0x1,%1;"
939
0
        "jne    Lloop2_%=;"
940
0
        "add    (%0),%3;"
941
0
        "mov    %3,(%0);"
942
0
        "add    0x4(%0),%4;"
943
0
        "mov    %4,0x4(%0);"
944
0
        "add    0x8(%0),%5;"
945
0
        "mov    %5,0x8(%0);"
946
0
        "add    0xc(%0),%6;"
947
0
        "mov    %6,0xc(%0);"
948
0
        "add    0x10(%0),%k2;"
949
0
        "mov    %k2,0x10(%0);"
950
0
        "add    0x14(%0),%7;"
951
0
        "mov    %7,0x14(%0);"
952
0
        "add    0x18(%0),%8;"
953
0
        "mov    %8,0x18(%0);"
954
0
        "add    0x1c(%0),%9;"
955
0
        "mov    %9,0x1c(%0);"
956
0
        "mov    %15,%1;"
957
0
        "add    $0x40,%1;"
958
0
        "cmp    %14,%1;"
959
0
        "jne    Lloop0_%=;"
960
961
0
        "Ldone_hash_%=:"
962
963
0
        : "+r"(s), "+r"(chunk), "+r"(blocks), "=r"(a), "=r"(b), "=r"(c), "=r"(d), /* e = chunk */ "=r"(f), "=r"(g), "=r"(h), "=r"(y0), "=r"(y1), "=r"(y2), "=r"(tbl), "+m"(inp_end), "+m"(inp), "+m"(xfer)
964
0
        : "m"(K256), "m"(FLIP_MASK), "m"(SHUF_00BA), "m"(SHUF_DC00)
965
0
        : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12"
966
0
   );
967
0
}
968
}
969
970
/*
971
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
972
; Copyright (c) 2012, Intel Corporation 
973
; 
974
; All rights reserved. 
975
; 
976
; Redistribution and use in source and binary forms, with or without
977
; modification, are permitted provided that the following conditions are
978
; met: 
979
; 
980
; * Redistributions of source code must retain the above copyright
981
;   notice, this list of conditions and the following disclaimer.  
982
; 
983
; * Redistributions in binary form must reproduce the above copyright
984
;   notice, this list of conditions and the following disclaimer in the
985
;   documentation and/or other materials provided with the
986
;   distribution. 
987
; 
988
; * Neither the name of the Intel Corporation nor the names of its
989
;   contributors may be used to endorse or promote products derived from
990
;   this software without specific prior written permission. 
991
; 
992
; 
993
; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
994
; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
995
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
996
; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
997
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
998
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
999
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
1000
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
1001
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
1002
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
1003
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1004
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1005
;
1006
; Example YASM command lines:
1007
; Windows:  yasm -Xvc -f x64 -rnasm -pnasm -o sha256_sse4.obj -g cv8 sha256_sse4.asm
1008
; Linux:    yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_sse4.o sha256_sse4.asm
1009
;
1010
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1011
;
1012
; This code is described in an Intel White-Paper:
1013
; "Fast SHA-256 Implementations on Intel Architecture Processors"
1014
;
1015
; To find it, surf to https://www.intel.com/p/en_US/embedded
1016
; and search for that title.
1017
; The paper is expected to be released roughly at the end of April, 2012
1018
;
1019
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1020
; This code schedules 1 block at a time, with 4 lanes per block
1021
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1022
1023
%define MOVDQ movdqu ;; assume buffers not aligned 
1024
1025
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
1026
1027
; addm [mem], reg
1028
; Add reg to mem using reg-mem add and store
1029
%macro addm 2
1030
    add %2, %1
1031
    mov %1, %2
1032
%endm
1033
1034
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1035
1036
; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
1037
; Load xmm with mem and byte swap each dword
1038
%macro COPY_XMM_AND_BSWAP 3
1039
    MOVDQ %1, %2
1040
    pshufb %1, %3
1041
%endmacro
1042
1043
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1044
1045
%define X0 xmm4
1046
%define X1 xmm5
1047
%define X2 xmm6
1048
%define X3 xmm7
1049
1050
%define XTMP0 xmm0
1051
%define XTMP1 xmm1
1052
%define XTMP2 xmm2
1053
%define XTMP3 xmm3
1054
%define XTMP4 xmm8
1055
%define XFER  xmm9
1056
1057
%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
1058
%define SHUF_DC00 xmm11 ; shuffle xDxC -> DC00
1059
%define BYTE_FLIP_MASK  xmm12
1060
    
1061
%ifdef LINUX
1062
%define NUM_BLKS rdx  ; 3rd arg
1063
%define CTX rsi ; 2nd arg
1064
%define INP rdi ; 1st arg
1065
1066
%define SRND  rdi ; clobbers INP
1067
%define c ecx
1068
%define d   r8d
1069
%define e   edx
1070
%else
1071
%define NUM_BLKS r8 ; 3rd arg
1072
%define CTX rdx   ; 2nd arg
1073
%define INP rcx   ; 1st arg
1074
1075
%define SRND  rcx ; clobbers INP
1076
%define c   edi 
1077
%define d esi 
1078
%define e   r8d
1079
    
1080
%endif
1081
%define TBL rbp
1082
%define a eax
1083
%define b ebx
1084
1085
%define f r9d
1086
%define g r10d
1087
%define h r11d
1088
1089
%define y0 r13d
1090
%define y1 r14d
1091
%define y2 r15d
1092
1093
1094
1095
_INP_END_SIZE equ 8
1096
_INP_SIZE equ 8
1097
_XFER_SIZE  equ 8
1098
%ifdef LINUX
1099
_XMM_SAVE_SIZE  equ 0
1100
%else
1101
_XMM_SAVE_SIZE  equ 7*16
1102
%endif
1103
; STACK_SIZE plus pushes must be an odd multiple of 8
1104
_ALIGN_SIZE equ 8
1105
1106
_INP_END  equ 0
1107
_INP    equ _INP_END  + _INP_END_SIZE
1108
_XFER   equ _INP      + _INP_SIZE
1109
_XMM_SAVE equ _XFER     + _XFER_SIZE + _ALIGN_SIZE
1110
STACK_SIZE  equ _XMM_SAVE + _XMM_SAVE_SIZE
1111
1112
; rotate_Xs
1113
; Rotate values of symbols X0...X3
1114
%macro rotate_Xs 0
1115
%xdefine X_ X0
1116
%xdefine X0 X1
1117
%xdefine X1 X2
1118
%xdefine X2 X3
1119
%xdefine X3 X_
1120
%endm
1121
1122
; ROTATE_ARGS
1123
; Rotate values of symbols a...h
1124
%macro ROTATE_ARGS 0
1125
%xdefine TMP_ h
1126
%xdefine h g
1127
%xdefine g f
1128
%xdefine f e
1129
%xdefine e d
1130
%xdefine d c
1131
%xdefine c b
1132
%xdefine b a
1133
%xdefine a TMP_
1134
%endm
1135
1136
%macro FOUR_ROUNDS_AND_SCHED 0
1137
  ;; compute s0 four at a time and s1 two at a time
1138
  ;; compute W[-16] + W[-7] 4 at a time
1139
  movdqa  XTMP0, X3
1140
    mov y0, e   ; y0 = e
1141
    ror y0, (25-11) ; y0 = e >> (25-11)
1142
    mov y1, a   ; y1 = a
1143
  palignr XTMP0, X2, 4  ; XTMP0 = W[-7]
1144
    ror y1, (22-13) ; y1 = a >> (22-13)
1145
    xor y0, e   ; y0 = e ^ (e >> (25-11))
1146
    mov y2, f   ; y2 = f
1147
    ror y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1148
  movdqa  XTMP1, X1
1149
    xor y1, a   ; y1 = a ^ (a >> (22-13)
1150
    xor y2, g   ; y2 = f^g
1151
  paddd XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
1152
    xor y0, e   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1153
    and y2, e   ; y2 = (f^g)&e
1154
    ror y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1155
  ;; compute s0
1156
  palignr XTMP1, X0, 4  ; XTMP1 = W[-15]
1157
    xor y1, a   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1158
    ror y0, 6   ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
1159
    xor y2, g   ; y2 = CH = ((f^g)&e)^g
1160
  movdqa  XTMP2, XTMP1  ; XTMP2 = W[-15]
1161
    ror y1, 2   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1162
    add y2, y0    ; y2 = S1 + CH
1163
    add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
1164
  movdqa  XTMP3, XTMP1  ; XTMP3 = W[-15]
1165
    mov y0, a   ; y0 = a
1166
    add h, y2   ; h = h + S1 + CH + k + w
1167
    mov y2, a   ; y2 = a
1168
  pslld XTMP1, (32-7)
1169
    or  y0, c   ; y0 = a|c
1170
    add d, h    ; d = d + h + S1 + CH + k + w
1171
    and y2, c   ; y2 = a&c
1172
  psrld XTMP2, 7
1173
    and y0, b   ; y0 = (a|c)&b
1174
    add h, y1   ; h = h + S1 + CH + k + w + S0
1175
  por XTMP1, XTMP2  ; XTMP1 = W[-15] ror 7
1176
    or  y0, y2    ; y0 = MAJ = (a|c)&b)|(a&c)
1177
    add h, y0   ; h = h + S1 + CH + k + w + S0 + MAJ
1178
1179
ROTATE_ARGS
1180
  movdqa  XTMP2, XTMP3  ; XTMP2 = W[-15]
1181
    mov y0, e   ; y0 = e
1182
    mov y1, a   ; y1 = a
1183
  movdqa  XTMP4, XTMP3  ; XTMP4 = W[-15]
1184
    ror y0, (25-11) ; y0 = e >> (25-11)
1185
    xor y0, e   ; y0 = e ^ (e >> (25-11))
1186
    mov y2, f   ; y2 = f
1187
    ror y1, (22-13) ; y1 = a >> (22-13)
1188
  pslld XTMP3, (32-18)
1189
    xor y1, a   ; y1 = a ^ (a >> (22-13)
1190
    ror y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1191
    xor y2, g   ; y2 = f^g
1192
  psrld XTMP2, 18
1193
    ror y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1194
    xor y0, e   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1195
    and y2, e   ; y2 = (f^g)&e
1196
    ror y0, 6   ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
1197
  pxor  XTMP1, XTMP3
1198
    xor y1, a   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1199
    xor y2, g   ; y2 = CH = ((f^g)&e)^g
1200
  psrld XTMP4, 3  ; XTMP4 = W[-15] >> 3
1201
    add y2, y0    ; y2 = S1 + CH
1202
    add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
1203
    ror y1, 2   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1204
  pxor  XTMP1, XTMP2  ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
1205
    mov y0, a   ; y0 = a
1206
    add h, y2   ; h = h + S1 + CH + k + w
1207
    mov y2, a   ; y2 = a
1208
  pxor  XTMP1, XTMP4  ; XTMP1 = s0
1209
    or  y0, c   ; y0 = a|c
1210
    add d, h    ; d = d + h + S1 + CH + k + w
1211
    and y2, c   ; y2 = a&c
1212
  ;; compute low s1
1213
  pshufd  XTMP2, X3, 11111010b  ; XTMP2 = W[-2] {BBAA}
1214
    and y0, b   ; y0 = (a|c)&b
1215
    add h, y1   ; h = h + S1 + CH + k + w + S0
1216
  paddd XTMP0, XTMP1  ; XTMP0 = W[-16] + W[-7] + s0
1217
    or  y0, y2    ; y0 = MAJ = (a|c)&b)|(a&c)
1218
    add h, y0   ; h = h + S1 + CH + k + w + S0 + MAJ
1219
1220
ROTATE_ARGS
1221
  movdqa  XTMP3, XTMP2  ; XTMP3 = W[-2] {BBAA}
1222
    mov y0, e   ; y0 = e
1223
    mov y1, a   ; y1 = a
1224
    ror y0, (25-11) ; y0 = e >> (25-11)
1225
  movdqa  XTMP4, XTMP2  ; XTMP4 = W[-2] {BBAA}
1226
    xor y0, e   ; y0 = e ^ (e >> (25-11))
1227
    ror y1, (22-13) ; y1 = a >> (22-13)
1228
    mov y2, f   ; y2 = f
1229
    xor y1, a   ; y1 = a ^ (a >> (22-13)
1230
    ror y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1231
  psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA}
1232
    xor y2, g   ; y2 = f^g
1233
  psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xBxA}
1234
    xor y0, e   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1235
    and y2, e   ; y2 = (f^g)&e
1236
  psrld XTMP4, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
1237
    ror y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1238
    xor y1, a   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1239
    xor y2, g   ; y2 = CH = ((f^g)&e)^g
1240
    ror y0, 6   ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
1241
  pxor  XTMP2, XTMP3
1242
    add y2, y0    ; y2 = S1 + CH
1243
    ror y1, 2   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1244
    add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
1245
  pxor  XTMP4, XTMP2  ; XTMP4 = s1 {xBxA}
1246
    mov y0, a   ; y0 = a
1247
    add h, y2   ; h = h + S1 + CH + k + w
1248
    mov y2, a   ; y2 = a
1249
  pshufb  XTMP4, SHUF_00BA  ; XTMP4 = s1 {00BA}
1250
    or  y0, c   ; y0 = a|c
1251
    add d, h    ; d = d + h + S1 + CH + k + w
1252
    and y2, c   ; y2 = a&c
1253
  paddd XTMP0, XTMP4  ; XTMP0 = {..., ..., W[1], W[0]}
1254
    and y0, b   ; y0 = (a|c)&b
1255
    add h, y1   ; h = h + S1 + CH + k + w + S0
1256
  ;; compute high s1
1257
  pshufd  XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
1258
    or  y0, y2    ; y0 = MAJ = (a|c)&b)|(a&c)
1259
    add h, y0   ; h = h + S1 + CH + k + w + S0 + MAJ
1260
1261
ROTATE_ARGS
1262
  movdqa  XTMP3, XTMP2  ; XTMP3 = W[-2] {DDCC}
1263
    mov y0, e   ; y0 = e
1264
    ror y0, (25-11) ; y0 = e >> (25-11)
1265
    mov y1, a   ; y1 = a
1266
  movdqa  X0,    XTMP2  ; X0    = W[-2] {DDCC}
1267
    ror y1, (22-13) ; y1 = a >> (22-13)
1268
    xor y0, e   ; y0 = e ^ (e >> (25-11))
1269
    mov y2, f   ; y2 = f
1270
    ror y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1271
  psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC}
1272
    xor y1, a   ; y1 = a ^ (a >> (22-13)
1273
    xor y2, g   ; y2 = f^g
1274
  psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xDxC}
1275
    xor y0, e   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1276
    and y2, e   ; y2 = (f^g)&e
1277
    ror y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1278
  psrld X0,    10 ; X0 = W[-2] >> 10 {DDCC}
1279
    xor y1, a   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1280
    ror y0, 6   ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
1281
    xor y2, g   ; y2 = CH = ((f^g)&e)^g
1282
  pxor  XTMP2, XTMP3
1283
    ror y1, 2   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1284
    add y2, y0    ; y2 = S1 + CH
1285
    add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
1286
  pxor  X0, XTMP2 ; X0 = s1 {xDxC}
1287
    mov y0, a   ; y0 = a
1288
    add h, y2   ; h = h + S1 + CH + k + w
1289
    mov y2, a   ; y2 = a
1290
  pshufb  X0, SHUF_DC00 ; X0 = s1 {DC00}
1291
    or  y0, c   ; y0 = a|c
1292
    add d, h    ; d = d + h + S1 + CH + k + w
1293
    and y2, c   ; y2 = a&c
1294
  paddd X0, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
1295
    and y0, b   ; y0 = (a|c)&b
1296
    add h, y1   ; h = h + S1 + CH + k + w + S0
1297
    or  y0, y2    ; y0 = MAJ = (a|c)&b)|(a&c)
1298
    add h, y0   ; h = h + S1 + CH + k + w + S0 + MAJ
1299
1300
ROTATE_ARGS
1301
rotate_Xs
1302
%endm
1303
1304
;; input is [rsp + _XFER + %1 * 4]
1305
%macro DO_ROUND 1
1306
    mov y0, e   ; y0 = e
1307
    ror y0, (25-11) ; y0 = e >> (25-11)
1308
    mov y1, a   ; y1 = a
1309
    xor y0, e   ; y0 = e ^ (e >> (25-11))
1310
    ror y1, (22-13) ; y1 = a >> (22-13)
1311
    mov y2, f   ; y2 = f
1312
    xor y1, a   ; y1 = a ^ (a >> (22-13)
1313
    ror y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1314
    xor y2, g   ; y2 = f^g
1315
    xor y0, e   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1316
    ror y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1317
    and y2, e   ; y2 = (f^g)&e
1318
    xor y1, a   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1319
    ror y0, 6   ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
1320
    xor y2, g   ; y2 = CH = ((f^g)&e)^g
1321
    add y2, y0    ; y2 = S1 + CH
1322
    ror y1, 2   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1323
    add y2, [rsp + _XFER + %1 * 4]  ; y2 = k + w + S1 + CH
1324
    mov y0, a   ; y0 = a
1325
    add h, y2   ; h = h + S1 + CH + k + w
1326
    mov y2, a   ; y2 = a
1327
    or  y0, c   ; y0 = a|c
1328
    add d, h    ; d = d + h + S1 + CH + k + w
1329
    and y2, c   ; y2 = a&c
1330
    and y0, b   ; y0 = (a|c)&b
1331
    add h, y1   ; h = h + S1 + CH + k + w + S0
1332
    or  y0, y2    ; y0 = MAJ = (a|c)&b)|(a&c)
1333
    add h, y0   ; h = h + S1 + CH + k + w + S0 + MAJ
1334
    ROTATE_ARGS
1335
%endm
1336
1337
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1338
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1339
;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks)
1340
;; arg 1 : pointer to input data
1341
;; arg 2 : pointer to digest
1342
;; arg 3 : Num blocks
1343
section .text
1344
global sha256_sse4
1345
align 32
1346
sha256_sse4:
1347
    push  rbx
1348
%ifndef LINUX
1349
    push  rsi
1350
    push  rdi
1351
%endif
1352
    push  rbp
1353
    push  r13
1354
    push  r14
1355
    push  r15
1356
1357
    sub rsp,STACK_SIZE
1358
%ifndef LINUX
1359
    movdqa  [rsp + _XMM_SAVE + 0*16],xmm6 
1360
    movdqa  [rsp + _XMM_SAVE + 1*16],xmm7
1361
    movdqa  [rsp + _XMM_SAVE + 2*16],xmm8 
1362
    movdqa  [rsp + _XMM_SAVE + 3*16],xmm9 
1363
    movdqa  [rsp + _XMM_SAVE + 4*16],xmm10
1364
    movdqa  [rsp + _XMM_SAVE + 5*16],xmm11
1365
    movdqa  [rsp + _XMM_SAVE + 6*16],xmm12
1366
%endif
1367
1368
    shl NUM_BLKS, 6 ; convert to bytes
1369
    jz  done_hash
1370
    add NUM_BLKS, INP ; pointer to end of data
1371
    mov [rsp + _INP_END], NUM_BLKS
1372
1373
    ;; load initial digest
1374
    mov a,[4*0 + CTX]
1375
    mov b,[4*1 + CTX]
1376
    mov c,[4*2 + CTX]
1377
    mov d,[4*3 + CTX]
1378
    mov e,[4*4 + CTX]
1379
    mov f,[4*5 + CTX]
1380
    mov g,[4*6 + CTX]
1381
    mov h,[4*7 + CTX]
1382
1383
    movdqa  BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
1384
    movdqa  SHUF_00BA, [_SHUF_00BA wrt rip]
1385
    movdqa  SHUF_DC00, [_SHUF_DC00 wrt rip]
1386
1387
loop0:
1388
    lea TBL,[K256 wrt rip]
1389
1390
    ;; byte swap first 16 dwords
1391
    COPY_XMM_AND_BSWAP  X0, [INP + 0*16], BYTE_FLIP_MASK
1392
    COPY_XMM_AND_BSWAP  X1, [INP + 1*16], BYTE_FLIP_MASK
1393
    COPY_XMM_AND_BSWAP  X2, [INP + 2*16], BYTE_FLIP_MASK
1394
    COPY_XMM_AND_BSWAP  X3, [INP + 3*16], BYTE_FLIP_MASK
1395
    
1396
    mov [rsp + _INP], INP
1397
1398
    ;; schedule 48 input dwords, by doing 3 rounds of 16 each
1399
    mov SRND, 3
1400
align 16
1401
loop1:
1402
    movdqa  XFER, [TBL + 0*16]
1403
    paddd XFER, X0
1404
    movdqa  [rsp + _XFER], XFER
1405
    FOUR_ROUNDS_AND_SCHED
1406
1407
    movdqa  XFER, [TBL + 1*16]
1408
    paddd XFER, X0
1409
    movdqa  [rsp + _XFER], XFER
1410
    FOUR_ROUNDS_AND_SCHED
1411
1412
    movdqa  XFER, [TBL + 2*16]
1413
    paddd XFER, X0
1414
    movdqa  [rsp + _XFER], XFER
1415
    FOUR_ROUNDS_AND_SCHED
1416
1417
    movdqa  XFER, [TBL + 3*16]
1418
    paddd XFER, X0
1419
    movdqa  [rsp + _XFER], XFER
1420
    add TBL, 4*16
1421
    FOUR_ROUNDS_AND_SCHED
1422
1423
    sub SRND, 1
1424
    jne loop1
1425
1426
    mov SRND, 2
1427
loop2:
1428
    paddd X0, [TBL + 0*16]
1429
    movdqa  [rsp + _XFER], X0
1430
    DO_ROUND  0
1431
    DO_ROUND  1
1432
    DO_ROUND  2
1433
    DO_ROUND  3
1434
    paddd X1, [TBL + 1*16]
1435
    movdqa  [rsp + _XFER], X1
1436
    add TBL, 2*16
1437
    DO_ROUND  0
1438
    DO_ROUND  1
1439
    DO_ROUND  2
1440
    DO_ROUND  3
1441
1442
    movdqa  X0, X2
1443
    movdqa  X1, X3
1444
1445
    sub SRND, 1
1446
    jne loop2
1447
1448
    addm  [4*0 + CTX],a
1449
    addm  [4*1 + CTX],b
1450
    addm  [4*2 + CTX],c
1451
    addm  [4*3 + CTX],d
1452
    addm  [4*4 + CTX],e
1453
    addm  [4*5 + CTX],f
1454
    addm  [4*6 + CTX],g
1455
    addm  [4*7 + CTX],h
1456
1457
    mov INP, [rsp + _INP]
1458
    add INP, 64
1459
    cmp INP, [rsp + _INP_END]
1460
    jne loop0
1461
1462
done_hash:
1463
%ifndef LINUX
1464
    movdqa  xmm6,[rsp + _XMM_SAVE + 0*16]
1465
    movdqa  xmm7,[rsp + _XMM_SAVE + 1*16]
1466
    movdqa  xmm8,[rsp + _XMM_SAVE + 2*16]
1467
    movdqa  xmm9,[rsp + _XMM_SAVE + 3*16]
1468
    movdqa  xmm10,[rsp + _XMM_SAVE + 4*16]
1469
    movdqa  xmm11,[rsp + _XMM_SAVE + 5*16]
1470
    movdqa  xmm12,[rsp + _XMM_SAVE + 6*16]
1471
%endif
1472
1473
    add rsp, STACK_SIZE
1474
1475
    pop r15
1476
    pop r14
1477
    pop r13
1478
    pop rbp
1479
%ifndef LINUX
1480
    pop rdi
1481
    pop rsi
1482
%endif
1483
    pop rbx
1484
1485
    ret 
1486
    
1487
1488
section .data
1489
align 64
1490
K256:
1491
    dd  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
1492
    dd  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
1493
    dd  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
1494
    dd  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
1495
    dd  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
1496
    dd  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
1497
    dd  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
1498
    dd  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
1499
    dd  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
1500
    dd  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
1501
    dd  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
1502
    dd  0xd192e819,0xd6990624,0xf40e3585,0x106aa070
1503
    dd  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
1504
    dd  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
1505
    dd  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
1506
    dd  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
1507
1508
PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203
1509
1510
; shuffle xBxA -> 00BA
1511
_SHUF_00BA:              ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
1512
1513
; shuffle xDxC -> DC00
1514
_SHUF_DC00:              ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
1515
*/
1516
1517
#endif
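
For readers of this report, the scalar work in each round of the inline assembly above follows the standard SHA-256 round function, using the Ch and Maj formulations noted in the Intel comments (CH = ((f^g)&e)^g, MAJ = ((a|c)&b)|(a&c)). Below is a minimal C sketch of one such round for reference; the names rotr32, sha256_round and k_plus_w are illustrative only and do not appear in the source file listed above.

/* Hypothetical reference sketch (not part of the file above): one scalar
 * SHA-256 round, written with the same Ch/Maj formulations that the
 * commented asm uses. */
#include <stdint.h>

static inline uint32_t rotr32(uint32_t x, unsigned n)
{
    return (x >> n) | (x << (32 - n));
}

/* st[0..7] = a..h; k_plus_w corresponds to the precomputed K+W value that
 * the asm stores in its _XFER slot before each group of four rounds. */
static void sha256_round(uint32_t st[8], uint32_t k_plus_w)
{
    uint32_t a = st[0], b = st[1], c = st[2], d = st[3];
    uint32_t e = st[4], f = st[5], g = st[6], h = st[7];

    uint32_t S1  = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25); /* y0 in the asm comments */
    uint32_t ch  = ((f ^ g) & e) ^ g;                            /* y2: CH = ((f^g)&e)^g   */
    uint32_t S0  = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22); /* y1                     */
    uint32_t maj = ((a | c) & b) | (a & c);                      /* MAJ = ((a|c)&b)|(a&c)  */

    uint32_t t1 = h + S1 + ch + k_plus_w;

    /* Rotate the working variables, as the ROTATE_ARGS macro does symbolically. */
    st[7] = g; st[6] = f; st[5] = e;
    st[4] = d + t1;
    st[3] = c; st[2] = b; st[1] = a;
    st[0] = t1 + S0 + maj;
}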