calc_addrs.cl


SUBMITTED BY: Guest

DATE: Nov. 15, 2012, 5:58 p.m.


  1. /*
  2. * Vanitygen, vanity bitcoin address generator
  3. * Copyright (C) 2011 <samr7@cs.washington.edu>
  4. *
  5. * Vanitygen is free software: you can redistribute it and/or modify
  6. * it under the terms of the GNU Affero General Public License as published by
  7. * the Free Software Foundation, either version 3 of the License, or
  8. * any later version.
  9. *
  10. * Vanitygen is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU Affero General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Affero General Public License
  16. * along with Vanitygen. If not, see <http://www.gnu.org/licenses/>.
  17. */
  18. /*
  19. * This file contains an OpenCL kernel for performing certain parts of
  20. * the bitcoin address calculation process.
  21. *
  22. * Kernel: ec_add_grid
  23. *
  24. * Inputs:
  25. * - Row: Array of (sequential) EC points
  26. * - Column: Array of column increment EC points (= rowsize * Pgenerator)
  27. *
  28. * Steps:
  29. * - Compute P = Row[x] + Column[y]
  30. * P is computed as numerator/denominator components Pxj, Pyj, Pz
  31. * Final values are: Px = Pxj / (Pz^2), Py = Pyj / (Pz^3)
  32. *
  33. * The modular inverse of Pz is required to compute Px and Py, and
  34. * can be computed more efficiently in large batches. This is done in
  35. * the next kernel heap_invert.
  36. *
  37. * - Store Pxj, Pyj to intermediate point buffer
  38. * - Store Pz to z_heap
  39. *
  40. * Outputs:
  41. * - Intermediate point buffer
  42. * - Denominator buffer (z_heap)
  43. *
  44. * -------------------------------
  45. * Kernel: heap_invert
  46. *
  47. * Inputs:
  48. * - Denominator buffer (z_heap)
  49. * - N = Batch size (power of 2)
  50. *
  51. * Steps:
  52. * - Compute the product tree for N values in the denominator buffer
  53. * - Compute the modular inverse of the root of the product tree
  54. * - Multiply down the tree to compute the modular inverse of each leaf
  55. *
  56. * Outputs:
  57. * - Modular inverse denominator buffer (z_heap)
  58. *
  59. * -------------------------------
  60. * Kernel: hash_ec_point_get
  61. *
  62. * Inputs:
  63. * - Intermediate point buffer
  64. * - Modular inverse denominator buffer (z_heap)
  65. *
  66. * Steps:
  67. * - Compute Px = Pxj * (1/Pz)^2
  68. * - Compute Py = Pyj * (1/Pz)^3
  69. * - Compute H = RIPEMD160(SHA256(0x04 | Px | Py))
  70. *
  71. * Output:
  72. * - Array of 20-byte address hash values
  73. *
  74. * -------------------------------
  75. * Kernel: hash_ec_point_search_prefix
  76. *
  77. * Like hash_ec_point_get, but instead of storing the complete hash
  78. * value to an output buffer, it searches a sorted list of ranges,
  79. * and if a match is found, writes a flag to an output buffer.
  80. */
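
The three kernels above are meant to run back to back on each batch. As a rough illustration of that flow only, here is a minimal host-side sketch using the standard OpenCL C API; the kernel names and argument order match the kernel definitions below, but the buffer handles, grid sizes and the heap_invert work split are placeholders (vanitygen's real host code sizes them from its own configuration), so treat this as an assumption-laden outline rather than the actual driver.

#include <CL/cl.h>

/* Illustrative only: one ec_add_grid / heap_invert / hash_ec_point_get
 * round for an ncols x nrows grid of points. */
static void demo_enqueue_round(cl_command_queue q,
                               cl_kernel k_add, cl_kernel k_inv, cl_kernel k_get,
                               cl_mem points, cl_mem z_heap,
                               cl_mem row, cl_mem col, cl_mem hashes,
                               size_t ncols, size_t nrows, cl_int batch)
{
	size_t grid2[2] = { ncols, nrows };   /* one work-item per point          */
	size_t grid1[1] = { ncols };          /* placeholder heap_invert layout   */

	clSetKernelArg(k_add, 0, sizeof(cl_mem), &points);
	clSetKernelArg(k_add, 1, sizeof(cl_mem), &z_heap);
	clSetKernelArg(k_add, 2, sizeof(cl_mem), &row);
	clSetKernelArg(k_add, 3, sizeof(cl_mem), &col);
	clEnqueueNDRangeKernel(q, k_add, 2, NULL, grid2, NULL, 0, NULL, NULL);

	clSetKernelArg(k_inv, 0, sizeof(cl_mem), &z_heap);
	clSetKernelArg(k_inv, 1, sizeof(cl_int), &batch);
	clEnqueueNDRangeKernel(q, k_inv, 1, NULL, grid1, NULL, 0, NULL, NULL);

	clSetKernelArg(k_get, 0, sizeof(cl_mem), &hashes);
	clSetKernelArg(k_get, 1, sizeof(cl_mem), &points);
	clSetKernelArg(k_get, 2, sizeof(cl_mem), &z_heap);
	clEnqueueNDRangeKernel(q, k_get, 2, NULL, grid2, NULL, 0, NULL, NULL);
}
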
  81. /* Byte-swapping and endianness */
  82. #define bswap32(v) \
  83. (((v) >> 24) | (((v) >> 8) & 0xff00) | \
  84. (((v) << 8) & 0xff0000) | ((v) << 24))
  85. #if __ENDIAN_LITTLE__ != 1
  86. #define load_le32(v) bswap32(v)
  87. #define load_be32(v) (v)
  88. #else
  89. #define load_le32(v) (v)
  90. #define load_be32(v) bswap32(v)
  91. #endif
  92. /*
  93. * Loop unrolling macros
  94. *
  95. * In most cases, preprocessor unrolling works best.
  96. * The exception is NVIDIA's compiler, which seems to take unreasonably
  97. * long to compile a loop with a larger iteration count, or a loop with
  98. * a body of >50 PTX instructions, with preprocessor unrolling.
  99. * However, it does not seem to take as long with pragma unroll, and
  100. * produces good output.
  101. */
  102. /* Explicit loop unrolling */
  103. #define unroll_5(a) do { a(0) a(1) a(2) a(3) a(4) } while (0)
  104. #define unroll_8(a) do { a(0) a(1) a(2) a(3) a(4) a(5) a(6) a(7) } while (0)
  105. #define unroll_1_7(a) do { a(1) a(2) a(3) a(4) a(5) a(6) a(7) } while (0)
  106. #define unroll_7(a) do { a(0) a(1) a(2) a(3) a(4) a(5) a(6) } while (0)
  107. #define unroll_7_0(a) do { a(7) a(6) a(5) a(4) a(3) a(2) a(1) a(0) } while (0)
  108. #define unroll_7_1(a) do { a(7) a(6) a(5) a(4) a(3) a(2) a(1) } while (0)
  109. #define unroll_16(a) do { \
  110. a(0) a(1) a(2) a(3) a(4) a(5) a(6) a(7) \
  111. a(8) a(9) a(10) a(11) a(12) a(13) a(14) a(15) \
  112. } while (0)
  113. #define unroll_64(a) do { \
  114. a(0) a(1) a(2) a(3) a(4) a(5) a(6) a(7) \
  115. a(8) a(9) a(10) a(11) a(12) a(13) a(14) a(15) \
  116. a(16) a(17) a(18) a(19) a(20) a(21) a(22) a(23) \
  117. a(24) a(25) a(26) a(27) a(28) a(29) a(30) a(31) \
  118. a(32) a(33) a(34) a(35) a(36) a(37) a(38) a(39) \
  119. a(40) a(41) a(42) a(43) a(44) a(45) a(46) a(47) \
  120. a(48) a(49) a(50) a(51) a(52) a(53) a(54) a(55) \
  121. a(56) a(57) a(58) a(59) a(60) a(61) a(62) a(63) \
  122. } while (0)
  123. /* Conditional loop unrolling */
  124. #if defined(DEEP_PREPROC_UNROLL)
  125. #define iter_5(a) unroll_5(a)
  126. #define iter_8(a) unroll_8(a)
  127. #define iter_16(a) unroll_16(a)
  128. #define iter_64(a) unroll_64(a)
  129. #else
  130. #define iter_5(a) do {int _i; for (_i = 0; _i < 5; _i++) { a(_i) }} while (0)
  131. #define iter_8(a) do {int _i; for (_i = 0; _i < 8; _i++) { a(_i) }} while (0)
  132. #define iter_16(a) do {int _i; for (_i = 0; _i < 16; _i++) { a(_i) }} while (0)
  133. #define iter_64(a) do {int _i; for (_i = 0; _i < 64; _i++) { a(_i) }} while (0)
  134. #endif
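
To make the pattern concrete, a small illustrative (non-kernel) use of these macros: the loop body is itself a macro taking the iteration index, so the same body can be expanded explicitly by unroll_8 or wrapped in a runtime for-loop by iter_8, depending on DEEP_PREPROC_UNROLL. The demo_* names are placeholders, not part of the file.

#define demo_clear_word(i) buf[i] = 0;
void demo_clear8(uint *buf)
{
	/* Expands to eight assignments or to a for-loop, per the macros above */
	iter_8(demo_clear_word);
}
#undef demo_clear_word
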
  135. /*
  136. * BIGNUM mini-library
  137. * This module deals with fixed-size 256-bit bignums.
  138. * Where modular arithmetic is performed, the SECP256k1 prime
  139. * modulus (below) is assumed.
  140. *
  141. * Methods include:
  142. * - bn_is_zero/bn_is_one/bn_is_odd/bn_is_even/bn_is_bit_set
  143. * - bn_rshift[1]/bn_lshift[1]
  144. * - bn_neg
  145. * - bn_uadd/bn_uadd_p
  146. * - bn_usub/bn_usub_p
  147. */
  148. typedef uint bn_word;
  149. #define BN_NBITS 256
  150. #define BN_WSHIFT 5
  151. #define BN_WBITS (1 << BN_WSHIFT)
  152. #define BN_NWORDS ((BN_NBITS/8) / sizeof(bn_word))
  153. #define BN_WORDMAX 0xffffffff
  154. #define MODULUS_BYTES \
  155. 0xfffffc2f, 0xfffffffe, 0xffffffff, 0xffffffff, \
  156. 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
  157. typedef struct {
  158. bn_word d[BN_NWORDS];
  159. } bignum;
  160. __constant bn_word modulus[] = { MODULUS_BYTES };
  161. __constant bignum bn_zero;
  162. __constant bn_word mont_rr[BN_NWORDS] = { 0xe90a1, 0x7a2, 0x1, 0, };
  163. __constant bn_word mont_n0[2] = { 0xd2253531, 0xd838091d };
  164. #define bn_is_odd(bn) (bn.d[0] & 1)
  165. #define bn_is_even(bn) (!bn_is_odd(bn))
  166. #define bn_is_zero(bn) (!bn.d[0] && !bn.d[1] && !bn.d[2] && \
  167. !bn.d[3] && !bn.d[4] && !bn.d[5] && \
  168. !bn.d[6] && !bn.d[7])
  169. #define bn_is_one(bn) ((bn.d[0] == 1) && !bn.d[1] && !bn.d[2] && \
  170. !bn.d[3] && !bn.d[4] && !bn.d[5] && \
  171. !bn.d[6] && !bn.d[7])
  172. #define bn_is_bit_set(bn, n) \
  173. ((((bn_word*)&bn)[n >> BN_WSHIFT]) & (1 << (n & (BN_WBITS-1))))
  174. #define bn_unroll(e) unroll_8(e)
  175. #define bn_unroll_sf(e) unroll_1_7(e)
  176. #define bn_unroll_sl(e) unroll_7(e)
  177. #define bn_unroll_reverse(e) unroll_7_0(e)
  178. #define bn_unroll_reverse_sl(e) unroll_7_1(e)
  179. #define bn_unroll_arg(e, arg) \
  180. e(arg, 0) e(arg, 1) e(arg, 2) e(arg, 3) \
  181. e(arg, 4) e(arg, 5) e(arg, 6) e(arg, 7)
  182. #define bn_unroll_arg_sf(e, arg) \
  183. e(arg, 1) e(arg, 2) e(arg, 3) \
  184. e(arg, 4) e(arg, 5) e(arg, 6) e(arg, 7)
  185. #define bn_iter(e) iter_8(e)
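
For orientation: a bignum holds a 256-bit value as eight 32-bit words with d[0] least significant, so the MODULUS_BYTES initializer above spells out the secp256k1 prime p = 2^256 - 2^32 - 977 starting from its low word (0xfffffc2f, 0xfffffffe, then six 0xffffffff words). A hypothetical helper that loads a 32-byte little-endian buffer into that layout, purely to illustrate the word order:

void demo_bn_load_le(bignum *bn, const uchar *buf32)
{
	int i;
	for (i = 0; i < 8; i++)		/* BN_NWORDS words, d[0] = least significant */
		bn->d[i] = (bn_word)buf32[4*i] |
			   ((bn_word)buf32[4*i + 1] << 8) |
			   ((bn_word)buf32[4*i + 2] << 16) |
			   ((bn_word)buf32[4*i + 3] << 24);
}
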
  186. /*
  187. * Bitwise shift
  188. */
  189. void
  190. bn_lshift1(bignum *bn)
  191. {
  192. #define bn_lshift1_inner1(i) \
  193. bn->d[i] = (bn->d[i] << 1) | (bn->d[i-1] >> 31);
  194. bn_unroll_reverse_sl(bn_lshift1_inner1);
  195. bn->d[0] <<= 1;
  196. }
  197. void
  198. bn_rshift(bignum *bn, int shift)
  199. {
  200. int wd, iws, iwr;
  201. bn_word ihw, ilw;
  202. iws = (shift & (BN_WBITS-1));
  203. iwr = BN_WBITS - iws;
  204. wd = (shift >> BN_WSHIFT);
  205. ihw = (wd < BN_NWORDS) ? bn->d[wd] : 0;
  206. #define bn_rshift_inner1(i) \
  207. wd++; \
  208. ilw = ihw; \
  209. ihw = (wd < BN_NWORDS) ? bn->d[wd] : 0; \
  210. bn->d[i] = (ilw >> iws) | (ihw << iwr);
  211. bn_unroll_sl(bn_rshift_inner1);
  212. bn->d[BN_NWORDS-1] = (ihw >> iws);
  213. }
  214. void
  215. bn_rshift1(bignum *bn)
  216. {
  217. #define bn_rshift1_inner1(i) \
  218. bn->d[i] = (bn->d[i+1] << 31) | (bn->d[i] >> 1);
  219. bn_unroll_sl(bn_rshift1_inner1);
  220. bn->d[BN_NWORDS-1] >>= 1;
  221. }
  222. void
  223. bn_rshift1_2(bignum *bna, bignum *bnb)
  224. {
  225. #define bn_rshift1_2_inner1(i) \
  226. bna->d[i] = (bna->d[i+1] << 31) | (bna->d[i] >> 1); \
  227. bnb->d[i] = (bnb->d[i+1] << 31) | (bnb->d[i] >> 1);
  228. bn_unroll_sl(bn_rshift1_2_inner1);
  229. bna->d[BN_NWORDS-1] >>= 1;
  230. bnb->d[BN_NWORDS-1] >>= 1;
  231. }
  232. /*
  233. * Unsigned comparison
  234. */
  235. int
  236. bn_ucmp_ge(bignum *a, bignum *b)
  237. {
  238. int l = 0, g = 0;
  239. #define bn_ucmp_ge_inner1(i) \
  240. if (a->d[i] < b->d[i]) l |= (1 << i); \
  241. if (a->d[i] > b->d[i]) g |= (1 << i);
  242. bn_unroll_reverse(bn_ucmp_ge_inner1);
  243. return (l > g) ? 0 : 1;
  244. }
  245. int
  246. bn_ucmp_ge_c(bignum *a, __constant bn_word *b)
  247. {
  248. int l = 0, g = 0;
  249. #define bn_ucmp_ge_c_inner1(i) \
  250. if (a->d[i] < b[i]) l |= (1 << i); \
  251. if (a->d[i] > b[i]) g |= (1 << i);
  252. bn_unroll_reverse(bn_ucmp_ge_c_inner1);
  253. return (l > g) ? 0 : 1;
  254. }
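
These comparisons avoid an early-out loop by collecting a per-word "less" and "greater" flag into bit i of l and g; words above the first difference set neither bit, so comparing the two masks as integers is decided by the most significant differing word, exactly like a conventional top-down compare. A straightforward reference version (hypothetical, for cross-checking only) would be:

int demo_bn_ucmp_ge_ref(bignum *a, bignum *b)
{
	int i;
	for (i = BN_NWORDS - 1; i >= 0; i--) {	/* most significant word first */
		if (a->d[i] < b->d[i]) return 0;
		if (a->d[i] > b->d[i]) return 1;
	}
	return 1;	/* equal counts as greater-or-equal */
}
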
  255. /*
  256. * Negate
  257. */
  258. void
  259. bn_neg(bignum *n)
  260. {
  261. int c = 1;
  262. #define bn_neg_inner1(i) \
  263. c = (n->d[i] = (~n->d[i]) + c) ? 0 : c;
  264. bn_unroll(bn_neg_inner1);
  265. }
  266. /*
  267. * Add/subtract
  268. */
  269. #define bn_add_word(r, a, b, t, c) do { \
  270. t = a + b; \
  271. c = (t < a) ? 1 : 0; \
  272. r = t; \
  273. } while (0)
  274. #define bn_addc_word(r, a, b, t, c) do { \
  275. t = a + b + c; \
  276. c = (t < a) ? 1 : ((c & (t == a)) ? 1 : 0); \
  277. r = t; \
  278. } while (0)
  279. bn_word
  280. bn_uadd_words_seq(bn_word *r, bn_word *a, bn_word *b)
  281. {
  282. bn_word t, c = 0;
  283. #define bn_uadd_words_seq_inner1(i) \
  284. bn_addc_word(r[i], a[i], b[i], t, c);
  285. bn_add_word(r[0], a[0], b[0], t, c);
  286. bn_unroll_sf(bn_uadd_words_seq_inner1);
  287. return c;
  288. }
  289. bn_word
  290. bn_uadd_words_c_seq(bn_word *r, bn_word *a, __constant bn_word *b)
  291. {
  292. bn_word t, c = 0;
  293. bn_add_word(r[0], a[0], b[0], t, c);
  294. bn_unroll_sf(bn_uadd_words_seq_inner1);
  295. return c;
  296. }
  297. #define bn_sub_word(r, a, b, t, c) do { \
  298. t = a - b; \
  299. c = (a < b) ? 1 : 0; \
  300. r = t; \
  301. } while (0)
  302. #define bn_subb_word(r, a, b, t, c) do { \
  303. t = a - (b + c); \
  304. c = (!(a) && c) ? 1 : 0; \
  305. c |= (a < b) ? 1 : 0; \
  306. r = t; \
  307. } while (0)
  308. bn_word
  309. bn_usub_words_seq(bn_word *r, bn_word *a, bn_word *b)
  310. {
  311. bn_word t, c = 0;
  312. #define bn_usub_words_seq_inner1(i) \
  313. bn_subb_word(r[i], a[i], b[i], t, c);
  314. bn_sub_word(r[0], a[0], b[0], t, c);
  315. bn_unroll_sf(bn_usub_words_seq_inner1);
  316. return c;
  317. }
  318. bn_word
  319. bn_usub_words_c_seq(bn_word *r, bn_word *a, __constant bn_word *b)
  320. {
  321. bn_word t, c = 0;
  322. bn_sub_word(r[0], a[0], b[0], t, c);
  323. bn_unroll_sf(bn_usub_words_seq_inner1);
  324. return c;
  325. }
  326. /*
  327. * Add/subtract better suited for AMD's VLIW architecture
  328. */
  329. bn_word
  330. bn_uadd_words_vliw(bn_word *r, bn_word *a, bn_word *b)
  331. {
  332. bignum x;
  333. bn_word c = 0, cp = 0;
  334. #define bn_uadd_words_vliw_inner1(i) \
  335. x.d[i] = a[i] + b[i];
  336. #define bn_uadd_words_vliw_inner2(i) \
  337. c |= (a[i] > x.d[i]) ? (1 << i) : 0; \
  338. cp |= (!~x.d[i]) ? (1 << i) : 0;
  339. #define bn_uadd_words_vliw_inner3(i) \
  340. r[i] = x.d[i] + ((c >> i) & 1);
  341. bn_unroll(bn_uadd_words_vliw_inner1);
  342. bn_unroll(bn_uadd_words_vliw_inner2);
  343. c = ((cp + (c << 1)) ^ cp);
  344. r[0] = x.d[0];
  345. bn_unroll_sf(bn_uadd_words_vliw_inner3);
  346. return c >> BN_NWORDS;
  347. }
  348. bn_word
  349. bn_uadd_words_c_vliw(bn_word *r, bn_word *a, __constant bn_word *b)
  350. {
  351. bignum x;
  352. bn_word c = 0, cp = 0;
  353. bn_unroll(bn_uadd_words_vliw_inner1);
  354. bn_unroll(bn_uadd_words_vliw_inner2);
  355. c = ((cp + (c << 1)) ^ cp);
  356. r[0] = x.d[0];
  357. bn_unroll_sf(bn_uadd_words_vliw_inner3);
  358. return c >> BN_NWORDS;
  359. }
  360. bn_word
  361. bn_usub_words_vliw(bn_word *r, bn_word *a, bn_word *b)
  362. {
  363. bignum x;
  364. bn_word c = 0, cp = 0;
  365. #define bn_usub_words_vliw_inner1(i) \
  366. x.d[i] = a[i] - b[i];
  367. #define bn_usub_words_vliw_inner2(i) \
  368. c |= (a[i] < b[i]) ? (1 << i) : 0; \
  369. cp |= (!x.d[i]) ? (1 << i) : 0;
  370. #define bn_usub_words_vliw_inner3(i) \
  371. r[i] = x.d[i] - ((c >> i) & 1);
  372. bn_unroll(bn_usub_words_vliw_inner1);
  373. bn_unroll(bn_usub_words_vliw_inner2);
  374. c = ((cp + (c << 1)) ^ cp);
  375. r[0] = x.d[0];
  376. bn_unroll_sf(bn_usub_words_vliw_inner3);
  377. return c >> BN_NWORDS;
  378. }
  379. bn_word
  380. bn_usub_words_c_vliw(bn_word *r, bn_word *a, __constant bn_word *b)
  381. {
  382. bignum x;
  383. bn_word c = 0, cp = 0;
  384. bn_unroll(bn_usub_words_vliw_inner1);
  385. bn_unroll(bn_usub_words_vliw_inner2);
  386. c = ((cp + (c << 1)) ^ cp);
  387. r[0] = x.d[0];
  388. bn_unroll_sf(bn_usub_words_vliw_inner3);
  389. return c >> BN_NWORDS;
  390. }
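
The non-obvious step in the VLIW variants is the line c = ((cp + (c << 1)) ^ cp): c collects "word i generated a carry" bits and cp collects "word i would propagate an incoming carry" bits (sum equal to all-ones for addition, difference equal to zero for subtraction), and the add-then-XOR ripples each generated carry through the run of propagating words above it, leaving bit i set exactly where word i receives a carry-in and bit 8 holding the final carry-out. A scalar sketch of the same trick for an 8 x 32-bit add, with hypothetical names and no claim to match the kernel's scheduling:

uint demo_add_with_lookahead(uint *r, const uint *a, const uint *b)
{
	uint x[8];
	uint gen = 0, prop = 0, carries;
	int i;
	for (i = 0; i < 8; i++) {
		x[i] = a[i] + b[i];
		if (x[i] < a[i])  gen  |= 1U << i;	/* word i generated a carry  */
		if (x[i] == ~0U)  prop |= 1U << i;	/* word i propagates a carry */
	}
	carries = (prop + (gen << 1)) ^ prop;		/* bit i = carry-in to word i */
	for (i = 0; i < 8; i++)
		r[i] = x[i] + ((carries >> i) & 1);
	return carries >> 8;				/* carry-out of the whole add */
}
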
  391. #if defined(DEEP_VLIW)
  392. #define bn_uadd_words bn_uadd_words_vliw
  393. #define bn_uadd_words_c bn_uadd_words_c_vliw
  394. #define bn_usub_words bn_usub_words_vliw
  395. #define bn_usub_words_c bn_usub_words_c_vliw
  396. #else
  397. #define bn_uadd_words bn_uadd_words_seq
  398. #define bn_uadd_words_c bn_uadd_words_c_seq
  399. #define bn_usub_words bn_usub_words_seq
  400. #define bn_usub_words_c bn_usub_words_c_seq
  401. #endif
  402. #define bn_uadd(r, a, b) bn_uadd_words((r)->d, (a)->d, (b)->d)
  403. #define bn_uadd_c(r, a, b) bn_uadd_words_c((r)->d, (a)->d, b)
  404. #define bn_usub(r, a, b) bn_usub_words((r)->d, (a)->d, (b)->d)
  405. #define bn_usub_c(r, a, b) bn_usub_words_c((r)->d, (a)->d, b)
  406. /*
  407. * Modular add/sub
  408. */
  409. void
  410. bn_mod_add(bignum *r, bignum *a, bignum *b)
  411. {
  412. if (bn_uadd(r, a, b) ||
  413. (bn_ucmp_ge_c(r, modulus)))
  414. bn_usub_c(r, r, modulus);
  415. }
  416. void
  417. bn_mod_sub(bignum *r, bignum *a, bignum *b)
  418. {
  419. if (bn_usub(r, a, b))
  420. bn_uadd_c(r, r, modulus);
  421. }
  422. void
  423. bn_mod_lshift1(bignum *bn)
  424. {
  425. bn_word c = (bn->d[BN_NWORDS-1] & 0x80000000);
  426. bn_lshift1(bn);
  427. if (c || (bn_ucmp_ge_c(bn, modulus)))
  428. bn_usub_c(bn, bn, modulus);
  429. }
  430. /*
  431. * Montgomery multiplication
  432. *
  433. * This includes normal multiplication of two "Montgomeryized"
  434. * bignums, and bn_from_mont for de-Montgomeryizing a bignum.
  435. */
  436. #define bn_mul_word(r, a, w, c, p, s) do { \
  437. r = (a * w) + c; \
  438. p = mul_hi(a, w); \
  439. c = (r < c) ? p + 1 : p; \
  440. } while (0)
  441. #define bn_mul_add_word(r, a, w, c, p, s) do { \
  442. s = r + c; \
  443. p = mul_hi(a, w); \
  444. r = (a * w) + s; \
  445. c = (s < c) ? p + 1 : p; \
  446. if (r < s) c++; \
  447. } while (0)
  448. void
  449. bn_mul_mont(bignum *r, bignum *a, bignum *b)
  450. {
  451. bignum t;
  452. bn_word tea, teb, c, p, s, m;
  453. #if !defined(VERY_EXPENSIVE_BRANCHES)
  454. int q;
  455. #endif
  456. c = 0;
  457. #define bn_mul_mont_inner1(j) \
  458. bn_mul_word(t.d[j], a->d[j], b->d[0], c, p, s);
  459. bn_unroll(bn_mul_mont_inner1);
  460. tea = c;
  461. teb = 0;
  462. c = 0;
  463. m = t.d[0] * mont_n0[0];
  464. bn_mul_add_word(t.d[0], modulus[0], m, c, p, s);
  465. #define bn_mul_mont_inner2(j) \
  466. bn_mul_add_word(t.d[j], modulus[j], m, c, p, s); \
  467. t.d[j-1] = t.d[j];
  468. bn_unroll_sf(bn_mul_mont_inner2);
  469. t.d[BN_NWORDS-1] = tea + c;
  470. tea = teb + ((t.d[BN_NWORDS-1] < c) ? 1 : 0);
  471. #define bn_mul_mont_inner3_1(i, j) \
  472. bn_mul_add_word(t.d[j], a->d[j], b->d[i], c, p, s);
  473. #define bn_mul_mont_inner3_2(i, j) \
  474. bn_mul_add_word(t.d[j], modulus[j], m, c, p, s); \
  475. t.d[j-1] = t.d[j];
  476. #define bn_mul_mont_inner3(i) \
  477. c = 0; \
  478. bn_unroll_arg(bn_mul_mont_inner3_1, i); \
  479. tea += c; \
  480. teb = ((tea < c) ? 1 : 0); \
  481. c = 0; \
  482. m = t.d[0] * mont_n0[0]; \
  483. bn_mul_add_word(t.d[0], modulus[0], m, c, p, s); \
  484. bn_unroll_arg_sf(bn_mul_mont_inner3_2, i); \
  485. t.d[BN_NWORDS-1] = tea + c; \
  486. tea = teb + ((t.d[BN_NWORDS-1] < c) ? 1 : 0);
  487. /*
  488. * The outer loop here is quite long, and we won't unroll it
  489. * unless VERY_EXPENSIVE_BRANCHES is set.
  490. */
  491. #if defined(VERY_EXPENSIVE_BRANCHES)
  492. bn_unroll_sf(bn_mul_mont_inner3);
  493. c = tea | !bn_usub_c(r, &t, modulus);
  494. if (!c)
  495. *r = t;
  496. #else
  497. for (q = 1; q < BN_NWORDS; q++) {
  498. bn_mul_mont_inner3(q);
  499. }
  500. c = tea || (t.d[BN_NWORDS-1] >= modulus[BN_NWORDS-1]);
  501. if (c) {
  502. c = tea | !bn_usub_c(r, &t, modulus);
  503. if (c)
  504. return;
  505. }
  506. *r = t;
  507. #endif
  508. }
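
For readers unfamiliar with the constants above: the bignums handled by bn_mul_mont are kept in Montgomery form, i.e. a value x is stored as x*R mod p with R = 2^256, so bn_mul_mont(a, b) returns a*b*R^-1 mod p. mont_rr is R^2 mod p (multiplying by it converts a plain value into Montgomery form), mont_n0[0] is -p^-1 mod 2^32 (the per-word reduction factor), and bn_from_mont below strips the factor of R back off. A single-word sketch of the same reduction in plain C, with R = 2^32 instead of 2^256 and a hypothetical modulus small enough (p odd, p < 2^31) that the 64-bit accumulator cannot overflow:

#include <stdint.h>

/* Returns a*b*R^-1 mod p for R = 2^32; n0 = -p^-1 mod 2^32. */
uint32_t demo_mont_mul_1word(uint32_t a, uint32_t b, uint32_t p, uint32_t n0)
{
	uint64_t t = (uint64_t)a * b;	/* full double-width product              */
	uint32_t m = (uint32_t)t * n0;	/* chosen so t + m*p is divisible by 2^32 */
	t += (uint64_t)m * p;		/* low word is now zero                   */
	t >>= 32;			/* exact division by R                    */
	return (t >= p) ? (uint32_t)(t - p) : (uint32_t)t;
}

To compute a plain a*b mod p with this primitive, each operand is first multiplied by R^2 mod p (the role mont_rr plays here) and the final result converted back, which is what bn_from_mont does for the 256-bit case.
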
  509. void
  510. bn_from_mont(bignum *rb, bignum *b)
  511. {
  512. #define WORKSIZE ((2*BN_NWORDS) + 1)
  513. bn_word r[WORKSIZE];
  514. bn_word m, c, p, s;
  515. #if defined(PRAGMA_UNROLL)
  516. int i;
  517. #endif
  518. /* Copy the input to the working area */
  519. /* Zero the upper words */
  520. #define bn_from_mont_inner1(i) \
  521. r[i] = b->d[i];
  522. #define bn_from_mont_inner2(i) \
  523. r[BN_NWORDS+i] = 0;
  524. bn_unroll(bn_from_mont_inner1);
  525. bn_unroll(bn_from_mont_inner2);
  526. r[WORKSIZE-1] = 0;
  527. /* Multiply (long) by modulus */
  528. #define bn_from_mont_inner3_1(i, j) \
  529. bn_mul_add_word(r[i+j], modulus[j], m, c, p, s);
  530. #if !defined(VERY_EXPENSIVE_BRANCHES)
  531. #define bn_from_mont_inner3_2(i) \
  532. if (r[BN_NWORDS + i] < c) \
  533. r[BN_NWORDS + i + 1] += 1;
  534. #else
  535. #define bn_from_mont_inner3_2(i) \
  536. r[BN_NWORDS + i + 1] += (r[BN_NWORDS + i] < c) ? 1 : 0;
  537. #endif
  538. #define bn_from_mont_inner3(i) \
  539. m = r[i] * mont_n0[0]; \
  540. c = 0; \
  541. bn_unroll_arg(bn_from_mont_inner3_1, i); \
  542. r[BN_NWORDS + i] += c; \
  543. bn_from_mont_inner3_2(i)
  544. /*
  545. * The outer loop here is not very long, so we will unroll
  546. * it by default. However, it's just complicated enough to
  547. * cause NVIDIA's compiler to take unreasonably long to compile
  548. * it, unless we use pragma unroll.
  549. */
  550. #if !defined(PRAGMA_UNROLL)
  551. bn_iter(bn_from_mont_inner3);
  552. #else
  553. #pragma unroll 8
  554. for (i = 0; i < BN_NWORDS; i++) { bn_from_mont_inner3(i) }
  555. #endif
  556. /*
  557. * Make sure the result is less than the modulus.
  558. * Subtracting is not much more expensive than compare, so
  559. * subtract always and assign based on the carry out value.
  560. */
  561. c = bn_usub_words_c(rb->d, &r[BN_NWORDS], modulus);
  562. if (c) {
  563. #define bn_from_mont_inner4(i) \
  564. rb->d[i] = r[BN_NWORDS + i];
  565. bn_unroll(bn_from_mont_inner4);
  566. }
  567. }
  568. /*
  569. * Modular inversion
  570. */
  571. void
  572. bn_mod_inverse(bignum *r, bignum *n)
  573. {
  574. bignum a, b, x, y;
  575. int shift;
  576. bn_word xc, yc;
  577. for (shift = 0; shift < BN_NWORDS; shift++) {
  578. a.d[shift] = modulus[shift];
  579. x.d[shift] = 0;
  580. y.d[shift] = 0;
  581. }
  582. b = *n;
  583. x.d[0] = 1;
  584. xc = 0;
  585. yc = 0;
  586. while (!bn_is_zero(b)) {
  587. shift = 0;
  588. while (!bn_is_odd(b)) {
  589. if (bn_is_odd(x))
  590. xc += bn_uadd_c(&x, &x, modulus);
  591. bn_rshift1_2(&x, &b);
  592. x.d[7] |= (xc << 31);
  593. xc >>= 1;
  594. }
  595. while (!bn_is_odd(a)) {
  596. if (bn_is_odd(y))
  597. yc += bn_uadd_c(&y, &y, modulus);
  598. bn_rshift1_2(&y, &a);
  599. y.d[7] |= (yc << 31);
  600. yc >>= 1;
  601. }
  602. if (bn_ucmp_ge(&b, &a)) {
  603. xc += yc + bn_uadd(&x, &x, &y);
  604. bn_usub(&b, &b, &a);
  605. } else {
  606. yc += xc + bn_uadd(&y, &y, &x);
  607. bn_usub(&a, &a, &b);
  608. }
  609. }
  610. if (!bn_is_one(a)) {
  611. /* no modular inverse */
  612. *r = bn_zero;
  613. } else {
  614. /* Compute y % m as cheaply as possible */
  615. while (yc < 0x80000000)
  616. yc -= bn_usub_c(&y, &y, modulus);
  617. bn_neg(&y);
  618. *r = y;
  619. }
  620. }
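
bn_mod_inverse is the right-shift (binary) extended Euclidean algorithm, with the xc/yc words absorbing the carries that appear because x and y are kept as plain 256-bit accumulators rather than being reduced mod p at every step. The same algorithm on machine words, without those carry games, looks like the following hypothetical reference (m odd, as the secp256k1 prime is, and m < 2^63 so the additions cannot overflow):

#include <stdint.h>

/* Returns x with (n * x) % m == 1, or 0 if gcd(n, m) != 1. */
uint64_t demo_mod_inverse(uint64_t n, uint64_t m)
{
	uint64_t a = m, b = n;
	uint64_t x = 0, y = 1;	/* invariants: a = x*n (mod m), b = y*n (mod m) */

	while (b != 0) {
		while ((b & 1) == 0) {		/* halve b, keeping the invariant */
			b >>= 1;
			y = (y & 1) ? (y + m) >> 1 : y >> 1;
		}
		while ((a & 1) == 0) {		/* halve a, keeping the invariant */
			a >>= 1;
			x = (x & 1) ? (x + m) >> 1 : x >> 1;
		}
		if (b >= a) {			/* subtract the smaller from the larger */
			b -= a;
			y = (y >= x) ? y - x : y + m - x;
		} else {
			a -= b;
			x = (x >= y) ? x - y : x + m - y;
		}
	}
	return (a == 1) ? x : 0;	/* a ends up as gcd(n, m) */
}
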
  621. /*
  622. * HASH FUNCTIONS
  623. *
  624. * BYTE ORDER NOTE: None of the hash functions below deal with byte
  625. * order. The caller is expected to be aware of this when it stuffs
  626. * data into the native integer words.
  627. *
  628. * NOTE #2: Endianness of the OpenCL device makes no difference here.
  629. */
  630. #define hash256_unroll(a) unroll_8(a)
  631. #define hash160_unroll(a) unroll_5(a)
  632. #define hash256_iter(a) iter_8(a)
  633. #define hash160_iter(a) iter_5(a)
  634. /*
  635. * SHA-2 256
  636. *
  637. * CAUTION: Input buffer will be overwritten/mangled.
  638. * Data expected in big-endian format.
  639. * This implementation is designed for space efficiency more than
  640. * raw speed.
  641. */
  642. __constant uint sha2_init[8] = {
  643. 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
  644. 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
  645. };
  646. __constant uint sha2_k[64] = {
  647. 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
  648. 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
  649. 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
  650. 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
  651. 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
  652. 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
  653. 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
  654. 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
  655. 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
  656. 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
  657. 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
  658. 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
  659. 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
  660. 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
  661. 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
  662. 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
  663. };
  664. void
  665. sha2_256_init(uint *out)
  666. {
  667. #define sha2_256_init_inner_1(i) \
  668. out[i] = sha2_init[i];
  669. hash256_unroll(sha2_256_init_inner_1);
  670. }
  671. /* The state variable remapping is really contorted */
  672. #define sha2_stvar(vals, i, v) vals[(64+v-i) % 8]
  673. #define sha2_s0(a) (rotate(a, 30U) ^ rotate(a, 19U) ^ rotate(a, 10U))
  674. #define sha2_s1(a) (rotate(a, 26U) ^ rotate(a, 21U) ^ rotate(a, 7U))
  675. #if defined(AMD_BFI_INT)
  676. #pragma OPENCL EXTENSION cl_amd_media_ops : enable
  677. #define sha2_ch(a, b, c) amd_bytealign(a, b, c)
  678. #define sha2_ma(a, b, c) amd_bytealign((a^c), b, a)
  679. #else
  680. #define sha2_ch(a, b, c) (c ^ (a & (b ^ c)))
  681. #define sha2_ma(a, b, c) ((a & c) | (b & (a | c)))
  682. #endif
  683. void
  684. sha2_256_block(uint *out, uint *in)
  685. {
  686. uint state[8], t1, t2;
  687. #if defined(PRAGMA_UNROLL)
  688. int i;
  689. #endif
  690. #define sha2_256_block_inner_1(i) \
  691. state[i] = out[i];
  692. hash256_unroll(sha2_256_block_inner_1);
  693. #define sha2_256_block_inner_2(i) \
  694. if (i >= 16) { \
  695. t1 = in[(i + 1) % 16]; \
  696. t2 = in[(i + 14) % 16]; \
  697. in[i % 16] += (in[(i + 9) % 16] + \
  698. (rotate(t1, 25U) ^ rotate(t1, 14U) ^ (t1 >> 3)) + \
  699. (rotate(t2, 15U) ^ rotate(t2, 13U) ^ (t2 >> 10))); \
  700. } \
  701. t1 = (sha2_stvar(state, i, 7) + \
  702. sha2_s1(sha2_stvar(state, i, 4)) + \
  703. sha2_ch(sha2_stvar(state, i, 4), \
  704. sha2_stvar(state, i, 5), \
  705. sha2_stvar(state, i, 6)) + \
  706. sha2_k[i] + \
  707. in[i % 16]); \
  708. t2 = (sha2_s0(sha2_stvar(state, i, 0)) + \
  709. sha2_ma(sha2_stvar(state, i, 0), \
  710. sha2_stvar(state, i, 1), \
  711. sha2_stvar(state, i, 2))); \
  712. sha2_stvar(state, i, 3) += t1; \
  713. sha2_stvar(state, i, 7) = t1 + t2;
  714. #if !defined(PRAGMA_UNROLL)
  715. iter_64(sha2_256_block_inner_2);
  716. #else
  717. #pragma unroll 64
  718. for (i = 0; i < 64; i++) { sha2_256_block_inner_2(i) }
  719. #endif
  720. #define sha2_256_block_inner_3(i) \
  721. out[i] += state[i];
  722. hash256_unroll(sha2_256_block_inner_3);
  723. }
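
As a usage illustration, a full SHA-256 of a 32-byte value needs exactly one block with these primitives: the message words (already big-endian), the 0x80 pad byte, zero fill, and the bit length in the last word. The padding pattern mirrors what hash_ec_point() builds further down; the demo_ name is not part of the file.

void demo_sha256_32bytes(uint *digest_out, uint *msg_be)
{
	uint block[16];
	int i;
	for (i = 0; i < 8; i++)
		block[i] = msg_be[i];	/* 32 bytes of message, big-endian words */
	block[8] = 0x80000000;		/* mandatory 0x80 padding byte           */
	for (i = 9; i < 15; i++)
		block[i] = 0;
	block[15] = 32 * 8;		/* message length in bits                */
	sha2_256_init(digest_out);
	sha2_256_block(digest_out, block);
}
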
  724. /*
  725. * RIPEMD160
  726. *
  727. * Data expected in little-endian format.
  728. */
  729. __constant uint ripemd160_iv[] = {
  730. 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0 };
  731. __constant uint ripemd160_k[] = {
  732. 0x00000000, 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xA953FD4E };
  733. __constant uint ripemd160_kp[] = {
  734. 0x50A28BE6, 0x5C4DD124, 0x6D703EF3, 0x7A6D76E9, 0x00000000 };
  735. __constant uchar ripemd160_ws[] = {
  736. 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
  737. 7, 4, 13, 1, 10, 6, 15, 3, 12, 0, 9, 5, 2, 14, 11, 8,
  738. 3, 10, 14, 4, 9, 15, 8, 1, 2, 7, 0, 6, 13, 11, 5, 12,
  739. 1, 9, 11, 10, 0, 8, 12, 4, 13, 3, 7, 15, 14, 5, 6, 2,
  740. 4, 0, 5, 9, 7, 12, 2, 10, 14, 1, 3, 8, 11, 6, 15, 13,
  741. };
  742. __constant uchar ripemd160_wsp[] = {
  743. 5, 14, 7, 0, 9, 2, 11, 4, 13, 6, 15, 8, 1, 10, 3, 12,
  744. 6, 11, 3, 7, 0, 13, 5, 10, 14, 15, 8, 12, 4, 9, 1, 2,
  745. 15, 5, 1, 3, 7, 14, 6, 9, 11, 8, 12, 2, 10, 0, 4, 13,
  746. 8, 6, 4, 1, 3, 11, 15, 0, 5, 12, 2, 13, 9, 7, 10, 14,
  747. 12, 15, 10, 4, 1, 5, 8, 7, 6, 2, 13, 14, 0, 3, 9, 11
  748. };
  749. __constant uchar ripemd160_rl[] = {
  750. 11, 14, 15, 12, 5, 8, 7, 9, 11, 13, 14, 15, 6, 7, 9, 8,
  751. 7, 6, 8, 13, 11, 9, 7, 15, 7, 12, 15, 9, 11, 7, 13, 12,
  752. 11, 13, 6, 7, 14, 9, 13, 15, 14, 8, 13, 6, 5, 12, 7, 5,
  753. 11, 12, 14, 15, 14, 15, 9, 8, 9, 14, 5, 6, 8, 6, 5, 12,
  754. 9, 15, 5, 11, 6, 8, 13, 12, 5, 12, 13, 14, 11, 8, 5, 6,
  755. };
  756. __constant uchar ripemd160_rlp[] = {
  757. 8, 9, 9, 11, 13, 15, 15, 5, 7, 7, 8, 11, 14, 14, 12, 6,
  758. 9, 13, 15, 7, 12, 8, 9, 11, 7, 7, 12, 7, 6, 15, 13, 11,
  759. 9, 7, 15, 11, 8, 6, 6, 14, 12, 13, 5, 14, 13, 13, 7, 5,
  760. 15, 5, 8, 11, 14, 14, 6, 14, 6, 9, 12, 9, 12, 5, 15, 8,
  761. 8, 5, 12, 9, 12, 5, 14, 6, 8, 13, 6, 5, 15, 13, 11, 11
  762. };
  763. #define ripemd160_val(v, i, n) (v)[(80+(n)-(i)) % 5]
  764. #define ripemd160_valp(v, i, n) (v)[5 + ((80+(n)-(i)) % 5)]
  765. #if defined(AMD_BFI_INT)
  766. #define ripemd160_f0(x, y, z) (x ^ y ^ z)
  767. #define ripemd160_f1(x, y, z) amd_bytealign(x, y, z)
  768. #define ripemd160_f2(x, y, z) (z ^ (x | ~y))
  769. #define ripemd160_f3(x, y, z) amd_bytealign(z, x, y)
  770. #define ripemd160_f4(x, y, z) (x ^ (y | ~z))
  771. #else
  772. #define ripemd160_f0(x, y, z) (x ^ y ^ z)
  773. #define ripemd160_f1(x, y, z) ((x & y) | (~x & z))
  774. #define ripemd160_f2(x, y, z) (z ^ (x | ~y))
  775. #define ripemd160_f3(x, y, z) ((x & z) | (y & ~z))
  776. #define ripemd160_f4(x, y, z) (x ^ (y | ~z))
  777. #endif
  778. #define ripemd160_round(i, in, vals, f, fp, t) do { \
  779. ripemd160_val(vals, i, 0) = \
  780. rotate(ripemd160_val(vals, i, 0) + \
  781. f(ripemd160_val(vals, i, 1), \
  782. ripemd160_val(vals, i, 2), \
  783. ripemd160_val(vals, i, 3)) + \
  784. in[ripemd160_ws[i]] + \
  785. ripemd160_k[i / 16], \
  786. (uint)ripemd160_rl[i]) + \
  787. ripemd160_val(vals, i, 4); \
  788. ripemd160_val(vals, i, 2) = \
  789. rotate(ripemd160_val(vals, i, 2), 10U); \
  790. ripemd160_valp(vals, i, 0) = \
  791. rotate(ripemd160_valp(vals, i, 0) + \
  792. fp(ripemd160_valp(vals, i, 1), \
  793. ripemd160_valp(vals, i, 2), \
  794. ripemd160_valp(vals, i, 3)) + \
  795. in[ripemd160_wsp[i]] + \
  796. ripemd160_kp[i / 16], \
  797. (uint)ripemd160_rlp[i]) + \
  798. ripemd160_valp(vals, i, 4); \
  799. ripemd160_valp(vals, i, 2) = \
  800. rotate(ripemd160_valp(vals, i, 2), 10U); \
  801. } while (0)
  802. void
  803. ripemd160_init(uint *out)
  804. {
  805. #define ripemd160_init_inner_1(i) \
  806. out[i] = ripemd160_iv[i];
  807. hash160_unroll(ripemd160_init_inner_1);
  808. }
  809. void
  810. ripemd160_block(uint *out, uint *in)
  811. {
  812. uint vals[10], t;
  813. #if defined(PRAGMA_UNROLL)
  814. int i;
  815. #endif
  816. #define ripemd160_block_inner_1(i) \
  817. vals[i] = vals[i + 5] = out[i];
  818. hash160_unroll(ripemd160_block_inner_1);
  819. #define ripemd160_block_inner_p0(i) \
  820. ripemd160_round(i, in, vals, \
  821. ripemd160_f0, ripemd160_f4, t);
  822. #define ripemd160_block_inner_p1(i) \
  823. ripemd160_round((16 + i), in, vals, \
  824. ripemd160_f1, ripemd160_f3, t);
  825. #define ripemd160_block_inner_p2(i) \
  826. ripemd160_round((32 + i), in, vals, \
  827. ripemd160_f2, ripemd160_f2, t);
  828. #define ripemd160_block_inner_p3(i) \
  829. ripemd160_round((48 + i), in, vals, \
  830. ripemd160_f3, ripemd160_f1, t);
  831. #define ripemd160_block_inner_p4(i) \
  832. ripemd160_round((64 + i), in, vals, \
  833. ripemd160_f4, ripemd160_f0, t);
  834. #if !defined(PRAGMA_UNROLL)
  835. iter_16(ripemd160_block_inner_p0);
  836. iter_16(ripemd160_block_inner_p1);
  837. iter_16(ripemd160_block_inner_p2);
  838. iter_16(ripemd160_block_inner_p3);
  839. iter_16(ripemd160_block_inner_p4);
  840. #else
  841. #pragma unroll 16
  842. for (i = 0; i < 16; i++) { ripemd160_block_inner_p0(i); }
  843. #pragma unroll 16
  844. for (i = 0; i < 16; i++) { ripemd160_block_inner_p1(i); }
  845. #pragma unroll 16
  846. for (i = 0; i < 16; i++) { ripemd160_block_inner_p2(i); }
  847. #pragma unroll 16
  848. for (i = 0; i < 16; i++) { ripemd160_block_inner_p3(i); }
  849. #pragma unroll 16
  850. for (i = 0; i < 16; i++) { ripemd160_block_inner_p4(i); }
  851. #endif
  852. t = out[1] + vals[2] + vals[8];
  853. out[1] = out[2] + vals[3] + vals[9];
  854. out[2] = out[3] + vals[4] + vals[5];
  855. out[3] = out[4] + vals[0] + vals[6];
  856. out[4] = out[0] + vals[1] + vals[7];
  857. out[0] = t;
  858. }
  859. #ifdef TEST_KERNELS
  860. /*
  861. * Test kernels
  862. */
  863. /* Montgomery multiplication test kernel */
  864. __kernel void
  865. test_mul_mont(__global bignum *products_out, __global bignum *nums_in)
  866. {
  867. bignum a, b, c;
  868. int o;
  869. o = get_global_id(0);
  870. nums_in += (2*o);
  871. a = nums_in[0];
  872. b = nums_in[1];
  873. bn_mul_mont(&c, &a, &b);
  874. products_out[o] = c;
  875. }
  876. /* modular inversion test kernel */
  877. __kernel void
  878. test_mod_inverse(__global bignum *inv_out, __global bignum *nums_in,
  879. int count)
  880. {
  881. bignum x, xp;
  882. int i, o;
  883. o = get_global_id(0) * count;
  884. for (i = 0; i < count; i++) {
  885. x = nums_in[o];
  886. bn_mod_inverse(&xp, &x);
  887. inv_out[o++] = xp;
  888. }
  889. }
  890. #endif /* TEST_KERNELS */
  891. #define ACCESS_BUNDLE 1024
  892. #define ACCESS_STRIDE (ACCESS_BUNDLE/BN_NWORDS)
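
These two defines describe how every per-point value is laid out in global memory: cells are grouped into bundles of ACCESS_BUNDLE words, and within a bundle the eight words of one cell are strided ACCESS_STRIDE apart, so neighbouring work-items load neighbouring addresses and the accesses coalesce. A small helper, added here only to spell out the indexing the kernels repeat inline:

/* Flat index of word `word` (0..BN_NWORDS-1) of logical cell `cell`. */
int demo_strided_index(int cell, int word)
{
	return ((cell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
	       (cell % ACCESS_STRIDE) +
	       (word * ACCESS_STRIDE);
}
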
  893. __kernel void
  894. ec_add_grid(__global bn_word *points_out, __global bn_word *z_heap,
  895. __global bn_word *row_in, __global bignum *col_in)
  896. {
  897. bignum rx, ry;
  898. bignum x1, y1, a, b, c, d, e, z;
  899. bn_word cy;
  900. int i, cell, start;
  901. /* Load the row increment point */
  902. i = 2 * get_global_id(1);
  903. rx = col_in[i];
  904. ry = col_in[i+1];
  905. cell = get_global_id(0);
  906. start = ((((2 * cell) / ACCESS_STRIDE) * ACCESS_BUNDLE) +
  907. (cell % (ACCESS_STRIDE/2)));
  908. #define ec_add_grid_inner_1(i) \
  909. x1.d[i] = row_in[start + (i*ACCESS_STRIDE)];
  910. bn_unroll(ec_add_grid_inner_1);
  911. start += (ACCESS_STRIDE/2);
  912. #define ec_add_grid_inner_2(i) \
  913. y1.d[i] = row_in[start + (i*ACCESS_STRIDE)];
  914. bn_unroll(ec_add_grid_inner_2);
  915. bn_mod_sub(&z, &x1, &rx);
  916. cell += (get_global_id(1) * get_global_size(0));
  917. start = (((cell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
  918. (cell % ACCESS_STRIDE));
  919. #define ec_add_grid_inner_3(i) \
  920. z_heap[start + (i*ACCESS_STRIDE)] = z.d[i];
  921. bn_unroll(ec_add_grid_inner_3);
  922. bn_mod_sub(&b, &y1, &ry);
  923. bn_mod_add(&c, &x1, &rx);
  924. bn_mod_add(&d, &y1, &ry);
  925. bn_mul_mont(&y1, &b, &b);
  926. bn_mul_mont(&x1, &z, &z);
  927. bn_mul_mont(&e, &c, &x1);
  928. bn_mod_sub(&y1, &y1, &e);
  929. /*
  930. * This disgusting code caters to the global memory unit on
  931. * various GPUs, by giving it a nice contiguous patch to write
  932. * per warp/wavefront.
  933. */
  934. start = ((((2 * cell) / ACCESS_STRIDE) * ACCESS_BUNDLE) +
  935. (cell % (ACCESS_STRIDE/2)));
  936. #define ec_add_grid_inner_4(i) \
  937. points_out[start + (i*ACCESS_STRIDE)] = y1.d[i];
  938. bn_unroll(ec_add_grid_inner_4);
  939. bn_mod_lshift1(&y1);
  940. bn_mod_sub(&y1, &e, &y1);
  941. bn_mul_mont(&y1, &y1, &b);
  942. bn_mul_mont(&a, &x1, &z);
  943. bn_mul_mont(&c, &d, &a);
  944. bn_mod_sub(&y1, &y1, &c);
  945. cy = 0;
  946. if (bn_is_odd(y1))
  947. cy = bn_uadd_c(&y1, &y1, modulus);
  948. bn_rshift1(&y1);
  949. y1.d[BN_NWORDS-1] |= (cy ? 0x80000000 : 0);
  950. start += (ACCESS_STRIDE/2);
  951. bn_unroll(ec_add_grid_inner_4);
  952. }
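
In affine terms, ec_add_grid evaluates the standard chord addition of the row point (x1, y1) and the column point (rx, ry) while deferring the single division. A summary of the algebra in the kernel's variable names (an editorial restatement, not part of the original file):

\lambda = \frac{y_1 - r_y}{x_1 - r_x}, \qquad P_z = x_1 - r_x

P_{xj} = (y_1 - r_y)^2 - (x_1 + r_x) P_z^2, \qquad
	P_x = \frac{P_{xj}}{P_z^2} = \lambda^2 - x_1 - r_x

P_{yj} = \frac{(y_1 - r_y)\bigl((x_1 + r_x) P_z^2 - 2 P_{xj}\bigr) - (y_1 + r_y) P_z^3}{2}, \qquad
	P_y = \frac{P_{yj}}{P_z^3} = \lambda (x_1 - P_x) - y_1

The guarded bn_rshift1 at the end of the kernel is the modular halving in P_yj; heap_invert later supplies 1/P_z, so hash_ec_point can recover P_x and P_y with a couple of multiplications instead of one division per point.
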
  953. __kernel void
  954. heap_invert(__global bn_word *z_heap, int batch)
  955. {
  956. bignum a, b, c, z;
  957. int i, off, lcell, hcell, start;
  958. #define heap_invert_inner_load_a(j) \
  959. a.d[j] = z_heap[start + j*ACCESS_STRIDE];
  960. #define heap_invert_inner_load_b(j) \
  961. b.d[j] = z_heap[start + j*ACCESS_STRIDE];
  962. #define heap_invert_inner_load_z(j) \
  963. z.d[j] = z_heap[start + j*ACCESS_STRIDE];
  964. #define heap_invert_inner_store_z(j) \
  965. z_heap[start + j*ACCESS_STRIDE] = z.d[j];
  966. #define heap_invert_inner_store_c(j) \
  967. z_heap[start + j*ACCESS_STRIDE] = c.d[j];
  968. off = get_global_size(0);
  969. lcell = get_global_id(0);
  970. hcell = (off * batch) + lcell;
  971. for (i = 0; i < (batch-1); i++) {
  972. start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
  973. (lcell % ACCESS_STRIDE));
  974. bn_unroll(heap_invert_inner_load_a);
  975. lcell += off;
  976. start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
  977. (lcell % ACCESS_STRIDE));
  978. bn_unroll(heap_invert_inner_load_b);
  979. bn_mul_mont(&z, &a, &b);
  980. start = (((hcell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
  981. (hcell % ACCESS_STRIDE));
  982. bn_unroll(heap_invert_inner_store_z);
  983. lcell += off;
  984. hcell += off;
  985. }
  986. /* Invert the root, fix up 1/ZR -> R/Z */
  987. bn_mod_inverse(&z, &z);
  988. #define heap_invert_inner_1(i) \
  989. a.d[i] = mont_rr[i];
  990. bn_unroll(heap_invert_inner_1);
  991. bn_mul_mont(&z, &z, &a);
  992. bn_mul_mont(&z, &z, &a);
  993. /* Unroll the first iteration to avoid a load/store on the root */
  994. lcell -= (off << 1);
  995. hcell -= (off << 1);
  996. start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
  997. (lcell % ACCESS_STRIDE));
  998. bn_unroll(heap_invert_inner_load_a);
  999. lcell += off;
  1000. start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
  1001. (lcell % ACCESS_STRIDE));
  1002. bn_unroll(heap_invert_inner_load_b);
  1003. bn_mul_mont(&c, &a, &z);
  1004. bn_unroll(heap_invert_inner_store_c);
  1005. bn_mul_mont(&c, &b, &z);
  1006. lcell -= off;
  1007. start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
  1008. (lcell % ACCESS_STRIDE));
  1009. bn_unroll(heap_invert_inner_store_c);
  1010. lcell -= (off << 1);
  1011. for (i = 0; i < (batch-2); i++) {
  1012. start = (((hcell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
  1013. (hcell % ACCESS_STRIDE));
  1014. bn_unroll(heap_invert_inner_load_z);
  1015. start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
  1016. (lcell % ACCESS_STRIDE));
  1017. bn_unroll(heap_invert_inner_load_a);
  1018. lcell += off;
  1019. start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
  1020. (lcell % ACCESS_STRIDE));
  1021. bn_unroll(heap_invert_inner_load_b);
  1022. bn_mul_mont(&c, &a, &z);
  1023. bn_unroll(heap_invert_inner_store_c);
  1024. bn_mul_mont(&c, &b, &z);
  1025. lcell -= off;
  1026. start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
  1027. (lcell % ACCESS_STRIDE));
  1028. bn_unroll(heap_invert_inner_store_c);
  1029. lcell -= (off << 1);
  1030. hcell -= off;
  1031. }
  1032. }
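
heap_invert is Montgomery's simultaneous-inversion trick arranged as a tree: the first loop multiplies pairs up toward a single root product, one bn_mod_inverse handles that root (with the 1/ZR -> R/Z fix-up for Montgomery form), and the remaining passes multiply the inverse back down so every leaf ends up holding 1/Pz. The flat, non-strided version of the same idea is short; demo_mod_inverse() is the sketch given earlier and demo_mod_mul() assumes m < 2^32 so the product fits in 64 bits, both being illustrative stand-ins for bn_mod_inverse()/bn_mul_mont():

#include <stdint.h>

static uint64_t demo_mod_mul(uint64_t a, uint64_t b, uint64_t m)
{
	return (a * b) % m;	/* safe because m < 2^32 */
}

/* Replace every v[i] with its modular inverse using one real inversion. */
void demo_batch_invert(uint64_t *v, uint64_t *scratch, int n, uint64_t m)
{
	uint64_t acc, t;
	int i;

	scratch[0] = v[0];	/* scratch[i] = v[0] * ... * v[i] (mod m) */
	for (i = 1; i < n; i++)
		scratch[i] = demo_mod_mul(scratch[i-1], v[i], m);

	acc = demo_mod_inverse(scratch[n-1], m);	/* the only inversion */

	for (i = n - 1; i > 0; i--) {
		t = demo_mod_mul(acc, scratch[i-1], m);	/* = 1/v[i]            */
		acc = demo_mod_mul(acc, v[i], m);	/* = 1/(v[0]...v[i-1]) */
		v[i] = t;
	}
	v[0] = acc;
}
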
  1033. void
  1034. hash_ec_point(uint *hash_out, uint *chash_out, __global bn_word *xy, __global bn_word *zip)
  1035. {
  1036. uint hash1[16], hash2[16], hash3[16], hash4[16];
  1037. bignum c, zi, zzi;
  1038. bn_word wh, wl;
  1039. /*
  1040. * Multiply the coordinates by the inverted Z values.
  1041. * Stash the coordinates in the hash buffer.
  1042. * SHA-2 requires big endian, and our intended hash input
  1043. * is big-endian, so swapping is unnecessary, but
  1044. * inserting the format byte in front causes a headache.
  1045. */
  1046. #define hash_ec_point_inner_1(i) \
  1047. zi.d[i] = zip[i*ACCESS_STRIDE];
  1048. bn_unroll(hash_ec_point_inner_1);
  1049. bn_mul_mont(&zzi, &zi, &zi); /* 1 / Z^2 */
  1050. #define hash_ec_point_inner_2(i) \
  1051. c.d[i] = xy[i*ACCESS_STRIDE];
  1052. bn_unroll(hash_ec_point_inner_2);
  1053. bn_mul_mont(&c, &c, &zzi); /* X / Z^2 */
  1054. bn_from_mont(&c, &c);
  1055. wh = 0x00000004; /* POINT_CONVERSION_UNCOMPRESSED */
  1056. #define hash_ec_point_inner_3(i) \
  1057. wl = wh; \
  1058. wh = c.d[(BN_NWORDS - 1) - i]; \
  1059. hash1[i] = (wl << 24) | (wh >> 8);
  1060. bn_unroll(hash_ec_point_inner_3);
  1061. bn_mul_mont(&zzi, &zzi, &zi); /* 1 / Z^3 */
  1062. #define hash_ec_point_inner_4(i) \
  1063. c.d[i] = xy[(ACCESS_STRIDE/2) + i*ACCESS_STRIDE];
  1064. bn_unroll(hash_ec_point_inner_4);
  1065. bn_mul_mont(&c, &c, &zzi); /* Y / Z^3 */
  1066. bn_from_mont(&c, &c);
  1067. #define hash_ec_point_inner_5(i) \
  1068. wl = wh; \
  1069. wh = c.d[(BN_NWORDS - 1) - i]; \
  1070. hash1[BN_NWORDS + i] = (wl << 24) | (wh >> 8);
  1071. bn_unroll(hash_ec_point_inner_5);
  1072. hash4[0] = hash1[0] ^ 0x06000000;
  1073. if(wh & 0x01){ hash4[0] ^= 0x01000000; }
  1074. hash4[1] = hash1[1];
  1075. hash4[2] = hash1[2];
  1076. hash4[3] = hash1[3];
  1077. hash4[4] = hash1[4];
  1078. hash4[5] = hash1[5];
  1079. hash4[6] = hash1[6];
  1080. hash4[7] = hash1[7];
  1081. hash4[8] = (hash1[8] & 0xff000000) | 0x800000;
  1082. hash4[9] = 0;
  1083. hash4[10] = 0;
  1084. hash4[11] = 0;
  1085. hash4[12] = 0;
  1086. hash4[13] = 0;
  1087. hash4[14] = 0;
  1088. hash4[15] = 33 * 8;
  1089. sha2_256_init(hash3);
  1090. sha2_256_block(hash3, hash4);
  1091. /*
  1092. * Hash the first 64 bytes of the buffer
  1093. */
  1094. sha2_256_init(hash2);
  1095. sha2_256_block(hash2, hash1);
  1096. /*
  1097. * Hash the last byte of the buffer + SHA-2 padding
  1098. */
  1099. hash1[0] = wh << 24 | 0x800000;
  1100. hash1[1] = 0;
  1101. hash1[2] = 0;
  1102. hash1[3] = 0;
  1103. hash1[4] = 0;
  1104. hash1[5] = 0;
  1105. hash1[6] = 0;
  1106. hash1[7] = 0;
  1107. hash1[8] = 0;
  1108. hash1[9] = 0;
  1109. hash1[10] = 0;
  1110. hash1[11] = 0;
  1111. hash1[12] = 0;
  1112. hash1[13] = 0;
  1113. hash1[14] = 0;
  1114. hash1[15] = 65 * 8;
  1115. sha2_256_block(hash2, hash1);
  1116. /*
  1117. * Hash the SHA-2 result with RIPEMD160
  1118. * Unfortunately, SHA-2 outputs big-endian, but
  1119. * RIPEMD160 expects little-endian. Need to swap!
  1120. */
  1121. #define hash_ec_point_inner_6(i) \
  1122. hash2[i] = bswap32(hash2[i]);
  1123. hash256_unroll(hash_ec_point_inner_6);
  1124. #define chash_ec_point_inner_6(i) \
  1125. hash3[i] = bswap32(hash3[i]);
  1126. hash256_unroll(chash_ec_point_inner_6);
  1127. hash2[8] = bswap32(0x80000000);
  1128. hash2[9] = 0;
  1129. hash2[10] = 0;
  1130. hash2[11] = 0;
  1131. hash2[12] = 0;
  1132. hash2[13] = 0;
  1133. hash2[14] = 32 * 8;
  1134. hash2[15] = 0;
  1135. ripemd160_init(hash_out);
  1136. ripemd160_block(hash_out, hash2);
  1137. hash3[8] = bswap32(0x80000000);
  1138. hash3[9] = 0;
  1139. hash3[10] = 0;
  1140. hash3[11] = 0;
  1141. hash3[12] = 0;
  1142. hash3[13] = 0;
  1143. hash3[14] = 32 * 8;
  1144. hash3[15] = 0;
  1145. ripemd160_init(chash_out);
  1146. ripemd160_block(chash_out, hash3);
  1147. }
  1148. __kernel void
  1149. hash_ec_point_get(__global uint *hashes_out,
  1150. __global bn_word *points_in, __global bn_word *z_heap)
  1151. {
  1152. uint hash[5], chash[5];
  1153. int i, p, cell, start;
  1154. cell = ((get_global_id(1) * get_global_size(0)) + get_global_id(0));
  1155. start = (((cell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
  1156. (cell % ACCESS_STRIDE));
  1157. z_heap += start;
  1158. start = ((((2 * cell) / ACCESS_STRIDE) * ACCESS_BUNDLE) +
  1159. (cell % (ACCESS_STRIDE/2)));
  1160. points_in += start;
  1161. /* Complete the coordinates and hash */
  1162. hash_ec_point(hash, chash, points_in, z_heap);
  1163. p = get_global_size(0);
  1164. i = p * get_global_id(1);
  1165. hashes_out += 5 * (i + get_global_id(0));
  1166. /* Output the hash in proper byte-order */
  1167. #define hash_ec_point_get_inner_1(i) \
  1168. hashes_out[i] = load_le32(hash[i]);
  1169. hash160_unroll(hash_ec_point_get_inner_1);
  1170. }
  1171. /*
  1172. * Normally this would be one function that compared two hash160s.
  1173. * This one compares a hash160 with an upper and lower bound in one
  1174. * function to work around a problem with AMD's OpenCL compiler.
  1175. */
  1176. int
  1177. hash160_ucmp_g(uint *a, __global uint *bound)
  1178. {
  1179. uint gv;
  1180. #define hash160_ucmp_g_inner_1(i) \
  1181. gv = load_be32(bound[i]); \
  1182. if (a[i] < gv) return -1; \
  1183. if (a[i] > gv) break;
  1184. hash160_iter(hash160_ucmp_g_inner_1);
  1185. #define hash160_ucmp_g_inner_2(i) \
  1186. gv = load_be32(bound[5+i]); \
  1187. if (a[i] < gv) return 0; \
  1188. if (a[i] > gv) return 1;
  1189. hash160_iter(hash160_ucmp_g_inner_2);
  1190. return 0;
  1191. }
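
The bound argument points at one entry of target_table: ten big-endian words forming a closed [low, high] range of hash160 values (five words each), with entries sorted so the kernel below can binary-search them. A hypothetical host-side equivalent of the in-range test, for reference only:

#include <string.h>

/* 1 if hash lies in [low, high]; all three are 20-byte big-endian hash160s. */
int demo_hash160_in_range(const unsigned char *hash,
                          const unsigned char *low,
                          const unsigned char *high)
{
	return memcmp(hash, low, 20) >= 0 && memcmp(hash, high, 20) <= 0;
}
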
  1192. __kernel void
  1193. hash_ec_point_search_prefix(__global uint *found,
  1194. __global bn_word *points_in,
  1195. __global bn_word *z_heap,
  1196. __global uint *target_table, int ntargets)
  1197. {
  1198. uint hash[5], chash[5];
  1199. int i, high, low, p, cell, start;
  1200. cell = ((get_global_id(1) * get_global_size(0)) + get_global_id(0));
  1201. start = (((cell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
  1202. (cell % ACCESS_STRIDE));
  1203. z_heap += start;
  1204. start = ((((2 * cell) / ACCESS_STRIDE) * ACCESS_BUNDLE) +
  1205. (cell % (ACCESS_STRIDE/2)));
  1206. points_in += start;
  1207. /* Complete the coordinates and hash */
  1208. hash_ec_point(hash, chash, points_in, z_heap);
  1209. /*
  1210. * Unconditionally byteswap the hash result, because:
  1211. * - The byte-level convention of RIPEMD160 is little-endian
  1212. * - We are comparing it in big-endian order
  1213. */
  1214. #define hash_ec_point_search_prefix_inner_1(i) \
  1215. hash[i] = bswap32(hash[i]);
  1216. hash160_unroll(hash_ec_point_search_prefix_inner_1);
  1217. /* Binary-search the target table for the hash we just computed */
  1218. for (high = ntargets - 1, low = 0, i = high >> 1;
  1219. high >= low;
  1220. i = low + ((high - low) >> 1)) {
  1221. p = hash160_ucmp_g(hash, &target_table[10*i]);
  1222. low = (p > 0) ? (i + 1) : low;
  1223. high = (p < 0) ? (i - 1) : high;
  1224. if (p == 0) {
  1225. /* For debugging purposes, write the hash value */
  1226. found[0] = ((get_global_id(1) * get_global_size(0)) +
  1227. get_global_id(0));
  1228. found[1] = i;
  1229. #define hash_ec_point_search_prefix_inner_2(i) \
  1230. found[i+2] = load_be32(hash[i]);
  1231. hash160_unroll(hash_ec_point_search_prefix_inner_2);
  1232. high = -1;
  1233. }
  1234. }
  1235. #define chash_ec_point_search_prefix_inner_1(i) \
  1236. chash[i] = bswap32(chash[i]);
  1237. hash160_unroll(chash_ec_point_search_prefix_inner_1);
  1238. /* Binary-search the target table for the hash we just computed */
  1239. for (high = ntargets - 1, low = 0, i = high >> 1;
  1240. high >= low;
  1241. i = low + ((high - low) >> 1)) {
  1242. p = hash160_ucmp_g(chash, &target_table[10*i]);
  1243. low = (p > 0) ? (i + 1) : low;
  1244. high = (p < 0) ? (i - 1) : high;
  1245. if (p == 0) {
  1246. /* For debugging purposes, write the hash value */
  1247. found[0] = ((get_global_id(1) * get_global_size(0)) +
  1248. get_global_id(0));
  1249. found[1] = i;
  1250. #define chash_ec_point_search_prefix_inner_2(i) \
  1251. found[i+2] = load_be32(chash[i]);
  1252. hash160_unroll(chash_ec_point_search_prefix_inner_2);
  1253. high = -1;
  1254. }
  1255. }
  1256. }
