PageRenderTime 48ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 1ms

/cln-1.3.2/src/base/digitseq/cl_DS_mul_fftp.h

#
C Header | 823 lines | 667 code | 25 blank | 131 comment | 120 complexity | 3b134dbc26829ce8778b6d29ebb5a2b5 MD5 | raw file
Possible License(s): GPL-2.0
  1. // Fast integer multiplication using FFT in a modular ring.
  2. // Bruno Haible 5.5.1996
  3. // FFT in the complex domain has the drawback that it needs careful round-off
  4. // error analysis. So here we choose another field of characteristic 0: Q_p.
  5. // Since Q_p contains exactly the (p-1)th roots of unity, we choose
  6. // p == 1 mod N and have the Nth roots of unity (N = 2^n) in Q_p and
  7. // even in Z_p. Actually, we compute in Z/(p^m Z).
  8. // All operations the FFT algorithm needs is addition, subtraction,
  9. // multiplication, multiplication by the Nth root of unity and division
  10. // by N. Hence we can use the domain Z/(p^m Z) even if p is not a prime!
  11. // We want to compute the convolution of N 32-bit words. The resulting
  12. // words are < (2^32)^2 * N. If is safe to compute in Z/pZ with p = 2^94 + 1
  13. // or p = 7*2^92 + 1. We choose p < 2^95 so that we can easily represent every
  14. // element of Z/pZ as three 32-bit words.
  15. #if !(intDsize==32)
  16. #error "fft mod p implemented only for intDsize==32"
  17. #endif
  18. #if 0
  19. typedef union {
  20. #if CL_DS_BIG_ENDIAN_P
  21. struct { uint32 w2; uint32 w1; uint32 w0; };
  22. #else
  23. struct { uint32 w0; uint32 w1; uint32 w2; };
  24. #endif
  25. uintD _w[3];
  26. } fftp_word;
  27. #else
  28. // typedef struct { uint32 w2; uint32 w1; uint32 w0; } fftp_word;
  29. // typedef struct { uint32 w0; uint32 w1; uint32 w2; } fftp_word;
  30. typedef struct { uintD _w[3]; } fftp_word;
  31. #endif
  32. #if CL_DS_BIG_ENDIAN_P
  33. #define w2 _w[0]
  34. #define w1 _w[1]
  35. #define w0 _w[2]
  36. #define W3(W2,W1,W0) { W2, W1, W0 }
  37. #else
  38. #define w0 _w[0]
  39. #define w1 _w[1]
  40. #define w2 _w[2]
  41. #define W3(W2,W1,W0) { W0, W1, W2 }
  42. #endif
  43. #if 0
  44. // p = 19807040628566084398385987585 = 5 * 3761 * 7484047069 * 140737471578113
  45. static const fftp_word p = W3( 1L<<30, 0, 1 ); // p = 2^94 + 1
  46. #define FFT_P_94
  47. static const fftp_word fftp_roots_of_1 [24+1] =
  48. // roots_of_1[n] is a (2^n)th root of unity in Z/pZ.
  49. // (Also roots_of_1[n-1] = roots_of_1[n]^2, but we don't need this.)
  50. // (To build this table, you need to compute roots of unity modulo the
  51. // factors of p and combine them using the Chinese Remainder Theorem.
  52. // Or ask me for "quadmod.lsp".)
  53. {
  54. W3( 0x00000000, 0x00000000, 0x00000001 ), // 1
  55. W3( 0x0000003F, 0xFFFFFFFF, 0xFF800000 ), // 1180591620717402914816
  56. W3( 0x20000040, 0x00004000, 0x00000001 ), // 9903521494874733285348474881
  57. W3( 0x3688E9A7, 0xDD78E2A9, 0x1E75974D ), // 16877707849775746711303853901
  58. W3( 0x286E6589, 0x5E86C1E0, 0x42710379 ), // 12512861726041464545960067961
  59. W3( 0x00D79325, 0x1A884885, 0xEA46D6C5 ), // 260613923531515619478787781
  60. W3( 0x1950B480, 0xC387CEE5, 0xA69C443F ), // 7834691712342412468047070271
  61. W3( 0x19DC9D08, 0x11CADC6A, 0x5BA8B123 ), // 8003830486242687653832601891
  62. W3( 0x21D6D905, 0xB8BAC7C3, 0xC3841613 ), // 10472740308573592285123712531
  63. W3( 0x27D73986, 0x6AF6BD27, 0x7A6D7909 ), // 12330106088710388189231937801
  64. W3( 0x20D4698B, 0x0039D457, 0xA092AECF ), // 10160311000635748689099534031
  65. W3( 0x049BD1C4, 0xA94F001A, 0xFA76E358 ), // 1426314143682376031341568856
  66. W3( 0x26DD7228, 0x09400257, 0x9BB49CB9 ), // 12028142067661291067236719801
  67. W3( 0x12DAA9AD, 0xAF9435A9, 0xD50FF483 ), // 5835077289334326375656453251
  68. W3( 0x0B7CDA03, 0x9418702E, 0x7CD934CA ), // 3555271451571910239441204426
  69. W3( 0x2D272FCF, 0xB8644522, 0x68EAD40B ), // 13974199331913037576372147211
  70. W3( 0x00EDA06E, 0x0114DA26, 0xE8D84BA9 ), // 287273027105701319912475561
  71. W3( 0x2219C2C4, 0xFD3145C6, 0xDD019359 ), // 10553633252320053510122083161
  72. W3( 0x1764F007, 0x4F5D5FD4, 0xDAB10AFC ), // 7240181310654329198595869436
  73. W3( 0x01AA13EE, 0x2D1CD906, 0x11D5B1EB ), // 515096517694807745704079851
  74. W3( 0x27038944, 0x5A37BAAD, 0x5CECA64C ), // 12074190385578921562318087756
  75. W3( 0x2459CF22, 0xF625FD38, 0xADB48511 ), // 11250032926302238120667809041
  76. W3( 0x25B6C6A8, 0xD684063F, 0x7ABAD1EF ), // 11671908005633729316324561391
  77. W3( 0x1C1A2BC6, 0x12B253F1, 0x0D1BBCB7 ), // 8697219061868963805380983991
  78. W3( 0x198F3FE2, 0x5EE9919F, 0x535E80D5 } // 7910303322630257758732976341
  79. };
  80. // Sadly, this p doesn't work because we don't find a (2^n)th root of unity w
  81. // such that w^(2^(n-1)) = -1 mod p. However, our algorithm below assumes
  82. // that w^(2^(n-1)) = -1...
  83. #else
  84. // p = 34662321099990647697175478273, a prime
  85. static const fftp_word p = W3( 7L<<28, 0, 1 ); // p = 7 * 2^92 + 1
  86. #define FFT_P_92
  87. static const fftp_word fftp_roots_of_1 [92+1] =
  88. // roots_of_1[n] is a (2^n)th root of unity in Z/pZ.
  89. // (Also roots_of_1[n-1] = roots_of_1[n]^2, but we don't need this.)
  90. {
  91. W3( 0x00000000, 0x00000000, 0x00000001 ), // 1
  92. W3( 0x70000000, 0x00000000, 0x00000000 ), // 34662321099990647697175478272
  93. W3( 0x064AF70F, 0x997E62CE, 0x77953100 ), // 1947537281862369253065568512
  94. W3( 0x261B8E96, 0xC3AD4296, 0xDA1BFA93 ), // 11793744727492885369350519443
  95. W3( 0x096EA949, 0x6EDCAF05, 0x47C92A4F ), // 2919146363086089454841571919
  96. W3( 0x366A8C3F, 0x7BF1436D, 0x2333BE9E ), // 16840998969615256469762195102
  97. W3( 0x27569FA8, 0xAE1775F1, 0xB21956A0 ), // 12174636971387721414084220576
  98. W3( 0x16CABB8B, 0xBAA59813, 0x62FCBCD9 ), // 7053758891710792545762852057
  99. W3( 0x1AE130A3, 0xF909B101, 0xB6BA30CF ), // 8318848263123793919933558991
  100. W3( 0x32AE8FEE, 0x6B1A656B, 0xED02BF24 ), // 15685283280129931240441823012
  101. W3( 0x2D1EE047, 0x5AEDC882, 0x8E96BCCC ), // 13964152342912072497719852236
  102. W3( 0x222A18FD, 0x3BF40635, 0xBFDEA8AD ), // 10573383226491471052459124909
  103. W3( 0x10534EE6, 0xED5A55D4, 0x06AE2155 ), // 5052473604609413010647032149
  104. W3( 0x02F3BFA3, 0x2D816786, 0xE6C27B3C ), // 913643975905572976593107772
  105. W3( 0x0B0CD0A5, 0x9A1FF4F7, 0x2624A5E1 ), // 3419827524917244902802499041
  106. W3( 0x257A492F, 0x156C141C, 0xFC5D75F4 ), // 11598779914676604137587439092
  107. W3( 0x061FB92A, 0xB1A1F41A, 0x7006920F ), // 1895261184698485907279745551
  108. W3( 0x2A4E1471, 0xDDB96073, 0xD8DDBB71 ), // 13092763174215078887900953457
  109. W3( 0x213B469E, 0xD72A84CA, 0xAAA477F2 ), // 10284665443205365583657072626
  110. W3( 0x1D7EF67C, 0x3DC2DA37, 0x4C86E9DC ), // 9128553932091860654576036316
  111. W3( 0x0CB7AA67, 0x2E087ED8, 0x2675D6E3 ), // 3935858248479385987820410595
  112. W3( 0x00BD7B24, 0x68388052, 0x57FFFB10 ), // 229068502577238003716979472
  113. W3( 0x1E1724A6, 0xBA587C3D, 0x0C12825B ), // 9312528669272006966417457755
  114. W3( 0x20595EF0, 0xC89DA33B, 0x3CB5583B ), // 10011563056352601486430394427
  115. W3( 0x15E730B2, 0x6D34E9EB, 0x71CCE555 ), // 6778677035560020292206912853
  116. W3( 0x015EFDBB, 0xC0A80C3B, 0xE4B1E017 ), // 424322259008787317821399063
  117. W3( 0x1B81FC63, 0x0C694944, 0x8EB481BF ), // 8513238559382277026756198847
  118. W3( 0x1AF53421, 0x5DCAA1A4, 0xD0C15A03 ), // 8343043259718611508685527555
  119. W3( 0x2F2B6B58, 0xBB60E464, 0x37A7DE2E ), // 14598286201875835624993840686
  120. W3( 0x27B4AB13, 0x54617640, 0xE86E757A ), // 12288329911800070034603013498
  121. W3( 0x041A31D2, 0xF0AC8E3C, 0x8AA4FD27 ), // 1269607397711669380834983207
  122. W3( 0x1A52F484, 0x39AC5917, 0x34E3F1F7 ), // 8146896869111203814625767927
  123. W3( 0x048FC120, 0x50F6ECBF, 0x268D86A8 ), // 1411728444351387120148776616
  124. W3( 0x27A2C427, 0x001F1239, 0x93380047 ), // 12266687669072434694473646151
  125. W3( 0x2E7E8DFB, 0x2411A754, 0xE12A9B1D ), // 14389305591459206001391737629
  126. W3( 0x29F14702, 0x40B3E1E2, 0xF7D71A8D ), // 12980571854778363745245010573
  127. W3( 0x3158DCE7, 0x8B8FEB32, 0x1DE35D24 ), // 15272194145252623177165790500
  128. W3( 0x12484C07, 0x437ED373, 0x9E45F602 ), // 5658131869639928287764805122
  129. W3( 0x1AEAE06E, 0xB905C908, 0x4389BF5F ), // 8330558749711089231534341983
  130. W3( 0x27BC0045, 0x43024FEB, 0xEC880258 ), // 12297194714773858676269122136
  131. W3( 0x2EFE1CBC, 0x0D2FAA94, 0xB4EA69A6 ), // 14543513305163560781242591654
  132. W3( 0x0B0D3D8B, 0xD779F105, 0x920367FA ), // 3420341787669373425792804858
  133. W3( 0x2D4D7BA9, 0x0970D8CF, 0x8CE6D7EC ), // 14020496699328277892009744364
  134. W3( 0x00DC5971, 0x0209470E, 0x713F2B27 ), // 266386055561000736260041511
  135. W3( 0x27E54E26, 0x53BA0137, 0xDD6740B3 ), // 12347128447319282384829366451
  136. W3( 0x2143A889, 0x8F2B57F5, 0xFB8181C1 ), // 10294799249108063706647986625
  137. W3( 0x1125419F, 0x5C4E0608, 0xE0AC0396 ), // 5306285315793562029414679446
  138. W3( 0x15B61D90, 0x63A27BB0, 0x26402B32 ), // 6719349317556695539371748146
  139. W3( 0x03B582FC, 0x419EF656, 0xB06BBC35 ), // 1147889163765050226454019125
  140. W3( 0x08FF62E1, 0xA3BB1145, 0xDA998F77 ), // 2784623116803271439773437815
  141. W3( 0x101978AF, 0xF93CBFA1, 0xB788B5A3 ), // 4982553232749484200897852835
  142. W3( 0x061334DE, 0x8FE5C6E9, 0x2B2309D6 ), // 1880129318103954373583505878
  143. W3( 0x343C6E7C, 0x8019BB43, 0xD954E744 ), // 16166277816826816936484857668
  144. W3( 0x06506A03, 0x0E6DE333, 0xF8011494 ), // 1954124751724394051182597268
  145. W3( 0x34892A42, 0x6502DAA3, 0x8FDA6971 ), // 16259042912153157504364865905
  146. W3( 0x0EF2C4BD, 0xF42D9711, 0xC32CEA49 ), // 4626279273705729025744104009
  147. W3( 0x24511305, 0x4F1EAE2C, 0x62FB10F4 ), // 11239473167855288013010178292
  148. W3( 0x14E5A052, 0xF1748A9C, 0xDD536730 ), // 6467301317787608309692589872
  149. W3( 0x0621D0A7, 0x0A5188AF, 0x7316C352 ), // 1897789944553576071437927250
  150. W3( 0x234498F0, 0xDF078E95, 0x6FEED50B ), // 10914904542475816633386325259
  151. W3( 0x029E4925, 0x948D6D57, 0xD4DF93A6 ), // 810325725128913871737688998
  152. W3( 0x11BB3805, 0x0589D746, 0x852F3E2F ), // 5487578840386649632552205871
  153. W3( 0x1D4370CA, 0xA4441B85, 0xC9606FE0 ), // 9056595957858187419376971744
  154. W3( 0x1C536F7D, 0x77D44926, 0x8DDB8932 ), // 8766447615182890705620797746
  155. W3( 0x3498CE71, 0xB726A4D3, 0xF4F3C813 ), // 16277952140466335672647796755
  156. W3( 0x1E4A297E, 0xAC13196E, 0xFACD8102 ), // 9374206759006667727054930178
  157. W3( 0x0E7C2CCC, 0xC940C98B, 0x0BC0CA49 ), // 4482908500893894680116251209
  158. W3( 0x124CF912, 0xD84438FD, 0x9C03585F ), // 5663784755954194195257972831
  159. W3( 0x06180FF8, 0xD447BEBE, 0xDB8821E7 ), // 1885999704184999223512015335
  160. W3( 0x1ED2EB11, 0x0687EC7C, 0xBE3436C8 ), // 9539534786948152514714023624
  161. W3( 0x30EBB35C, 0x59616A3C, 0x502CBB52 ), // 15140225046175435352009653074
  162. W3( 0x33E24883, 0xEDA36D60, 0xA25C8E5F ), // 16057295180155395438855097951
  163. W3( 0x0D879ED9, 0x076BAB06, 0x9BE12AA2 ), // 4187260250707927256570866338
  164. W3( 0x1A1B6C9C, 0x0966383B, 0x54123A87 ), // 8079764146434082816365050503
  165. W3( 0x31BD863A, 0xA2A6505C, 0xD759E6CF ), // 15393886339893077529104869071
  166. W3( 0x3209AF0A, 0x5E5055A1, 0x480AF03F ), // 15485957428841754012708171839
  167. W3( 0x1A4CC03C, 0xC8AA650B, 0x7F4DBCE9 ), // 8139396433274519652257348841
  168. W3( 0x3596471F, 0xB99D2EA3, 0xA3433E0C ), // 16584380266717730797139475980
  169. W3( 0x28E87642, 0x98E21FCE, 0xDE1B53EA ), // 12660429650750886812340409322
  170. W3( 0x20161DB9, 0xCDC199E9, 0x0A6BEDF2 ), // 9930257058416521571476434418
  171. W3( 0x1D0DC095, 0x2C40D22B, 0x088549BA ), // 8991690766592354745340742074
  172. W3( 0x2FCC953C, 0xA8B62408, 0x50FC4C29 ), // 14793121080372138443684989993
  173. W3( 0x0F854B39, 0xF659B4B5, 0xD2B0A6AC ), // 4803417528030967235217499820
  174. W3( 0x30E087D4, 0x02F3BBAB, 0xBA503373 ), // 15126721285415891216108630899
  175. W3( 0x0DAF660C, 0x26B99C42, 0x98B8BE05 ), // 4235349051642660841298902533
  176. W3( 0x0ED6AE0E, 0xCD02982A, 0xD233F0D9 ), // 4592322227691334993146278105
  177. W3( 0x3415EB9B, 0x4B61C19F, 0xB21F1255 ), // 16119720573722492095181034069
  178. W3( 0x1015A729, 0x20A1FAA2, 0x0D094529 ), // 4977936993224010619482096937
  179. W3( 0x1D2E3AD2, 0x7093579F, 0x1C93C97B ), // 9030953651705465548198627707
  180. W3( 0x130EAA8F, 0x859C980F, 0xD9E7E8ED ), // 5897945597894388791627999469
  181. W3( 0x2B7CA1C8, 0xFC34C5B5, 0x9C0B1C0C ), // 13458526232475976507763399692
  182. W3( 0x22367055, 0xA53B526A, 0x7505EABE ), // 10588302813110450634719881918
  183. W3( 0x344FEF55, 0x0B77067F, 0x38999E77 ) // 16189855864848287589134343799
  184. };
  185. #endif
  186. // Define this if you want the external loops instead of inline operations.
  187. #define FFTP_EXTERNAL_LOOPS
  188. // Define this for (cheap) consistency checks.
  189. //#define DEBUG_FFTP
  190. // Define this for extensive consistency checks.
  191. //#define DEBUG_FFTP_OPERATIONS
  192. // Define the algorithm of the backward FFT:
  193. // Either FORWARD (a normal FFT followed by a permutation)
  194. // or RECIPROOT (an FFT with reciprocal root of unity)
  195. // or CLEVER (an FFT with reciprocal root of unity but clever computation
  196. // of the reciprocals).
  197. // Drawback of FORWARD: the permutation pass.
  198. // Drawback of RECIPROOT: need all the powers of the root, not only half of them.
  199. #define FORWARD 42
  200. #define RECIPROOT 43
  201. #define CLEVER 44
  202. #define FFTP_BACKWARD CLEVER
  203. // r := a + b
  204. static inline void add (const fftp_word& a, const fftp_word& b, fftp_word& r)
  205. {
  206. #ifdef FFTP_EXTERNAL_LOOPS
  207. add_loop_lsp(arrayLSDptr(a._w,3),arrayLSDptr(b._w,3),arrayLSDptr(r._w,3),3);
  208. #else
  209. var uint32 tmp;
  210. tmp = a.w0 + b.w0;
  211. if (tmp >= a.w0) {
  212. // no carry
  213. r.w0 = tmp;
  214. tmp = a.w1 + b.w1;
  215. if (tmp >= a.w1) goto no_carry_1; else goto carry_1;
  216. } else {
  217. // carry
  218. r.w0 = tmp;
  219. tmp = a.w1 + b.w1 + 1;
  220. if (tmp > a.w1) goto no_carry_1; else goto carry_1;
  221. }
  222. if (1) {
  223. no_carry_1: // no carry
  224. r.w1 = tmp;
  225. tmp = a.w2 + b.w2;
  226. } else {
  227. carry_1: // carry
  228. r.w1 = tmp;
  229. tmp = a.w2 + b.w2 + 1;
  230. }
  231. r.w2 = tmp;
  232. #endif
  233. }
  234. // r := a - b
  235. static inline void sub (const fftp_word& a, const fftp_word& b, fftp_word& r)
  236. {
  237. #ifdef FFTP_EXTERNAL_LOOPS
  238. sub_loop_lsp(arrayLSDptr(a._w,3),arrayLSDptr(b._w,3),arrayLSDptr(r._w,3),3);
  239. #else
  240. var uint32 tmp;
  241. tmp = a.w0 - b.w0;
  242. if (tmp <= a.w0) {
  243. // no carry
  244. r.w0 = tmp;
  245. tmp = a.w1 - b.w1;
  246. if (tmp <= a.w1) goto no_carry_1; else goto carry_1;
  247. } else {
  248. // carry
  249. r.w0 = tmp;
  250. tmp = a.w1 - b.w1 - 1;
  251. if (tmp < a.w1) goto no_carry_1; else goto carry_1;
  252. }
  253. if (1) {
  254. no_carry_1: // no carry
  255. r.w1 = tmp;
  256. tmp = a.w2 - b.w2;
  257. } else {
  258. carry_1: // carry
  259. r.w1 = tmp;
  260. tmp = a.w2 - b.w2 - 1;
  261. }
  262. r.w2 = tmp;
  263. #endif
  264. }
  265. // b := a >> 1
  266. static inline void shift (const fftp_word& a, fftp_word& b)
  267. {
  268. #ifdef FFTP_EXTERNAL_LOOPS
  269. #ifdef DEBUG_FFTP
  270. if (shiftrightcopy_loop_msp(arrayMSDptr(a._w,3),arrayMSDptr(b._w,3),3,1,0))
  271. throw runtime_exception();
  272. #else
  273. shiftrightcopy_loop_msp(arrayMSDptr(a._w,3),arrayMSDptr(b._w,3),3,1,0);
  274. #endif
  275. #else
  276. var uint32 tmp, carry;
  277. tmp = a.w2;
  278. b.w2 = a.w2 >> 1;
  279. carry = tmp << 31;
  280. tmp = a.w1;
  281. b.w1 = (tmp >> 1) | carry;
  282. carry = tmp << 31;
  283. tmp = a.w0;
  284. b.w0 = (tmp >> 1) | carry;
  285. #ifdef DEBUG_FFTP
  286. carry = tmp << 31;
  287. if (carry)
  288. throw runtime_exception();
  289. #endif
  290. #endif
  291. }
  292. #ifdef DEBUG_FFTP_OPERATIONS
  293. #define check_fftp_word(x) if (compare_loop_msp(arrayMSDptr((x)._w,3),arrayMSDptr(p._w,3),3) >= 0) throw runtime_exception()
  294. #else
  295. #define check_fftp_word(x)
  296. #endif
  297. // r := (a + b) mod p
  298. static inline void addp (const fftp_word& a, const fftp_word& b, fftp_word& r)
  299. {
  300. check_fftp_word(a); check_fftp_word(b);
  301. #ifdef FFTP_EXTERNAL_LOOPS
  302. add(a,b, r);
  303. if (compare_loop_msp(arrayMSDptr(r._w,3),arrayMSDptr(p._w,3),3) >= 0)
  304. sub(r,p, r);
  305. #else
  306. add(a,b, r);
  307. if ((r.w2 > p.w2)
  308. || ((r.w2 == p.w2)
  309. && ((r.w1 > p.w1)
  310. || ((r.w1 == p.w1)
  311. && (r.w0 >= p.w0)))))
  312. sub(r,p, r);
  313. #endif
  314. check_fftp_word(r);
  315. }
  316. // r := (a - b) mod p
  317. static inline void subp (const fftp_word& a, const fftp_word& b, fftp_word& r)
  318. {
  319. check_fftp_word(a); check_fftp_word(b);
  320. sub(a,b, r);
  321. if ((sint32)r.w2 < 0)
  322. add(r,p, r);
  323. check_fftp_word(r);
  324. }
  325. // r := (a * b) mod p
  326. static void mulp (const fftp_word& a, const fftp_word& b, fftp_word& r)
  327. {
  328. check_fftp_word(a); check_fftp_word(b);
  329. #if defined(FFT_P_94)
  330. var uintD c[6];
  331. var uintD* const cLSDptr = arrayLSDptr(c,6);
  332. // Multiply the two words, using the standard method.
  333. mulu_2loop(arrayLSDptr(a._w,3),3, arrayLSDptr(b._w,3),3, cLSDptr);
  334. // c[0..5] now contains the product.
  335. // Divide by p.
  336. // To divide c (0 <= c < p^2) by p = 2^n+1,
  337. // we set q := floor(c/2^n) and r := c - q*p = (c mod 2^n) - q.
  338. // If this becomes negative, set r := r + p (at most twice).
  339. // (This works because floor(c/p) <= q <= floor(c/p)+2.)
  340. // (Actually, here, 0 <= c <= (p-1)^2, hence
  341. // floor(c/p) <= q <= floor(c/p)+1, so we have
  342. // to set r := r + p at most once!)
  343. // n = 94 = 3*32-2 = 2*32+30.
  344. shiftleft_loop_lsp(cLSDptr lspop 3,3,2,lspref(cLSDptr,2)>>30);
  345. lspref(cLSDptr,2) &= bit(30)-1;
  346. // c[0..2] now contains q, c[3..5] contains (c mod 2^n).
  347. #if 0
  348. if (compare_loop_msp(cLSDptr lspop 6,arrayMSDptr(p._w,3),3) >= 0) // q >= p ?
  349. subfrom_loop_lsp(arrayLSDptr(p._w,3),cLSDptr lspop 3,3); // q -= p;
  350. #endif
  351. if (subfrom_loop_lsp(cLSDptr lspop 3,cLSDptr,3)) // (c mod 2^n) - q
  352. addto_loop_lsp(arrayLSDptr(p._w,3),cLSDptr,3);
  353. r.w2 = lspref(cLSDptr,2); r.w1 = lspref(cLSDptr,1); r.w0 = lspref(cLSDptr,0);
  354. #elif defined(FFT_P_92)
  355. var uintD c[7];
  356. var uintD* const cLSDptr = arrayLSDptr(c,7);
  357. // Multiply the two words, using the standard method.
  358. mulu_2loop(arrayLSDptr(a._w,3),3, arrayLSDptr(b._w,3),3, cLSDptr);
  359. // c[1..6] now contains the product.
  360. // Divide by p.
  361. // To divide c (0 <= c < p^2) by p = 7*2^n+1,
  362. // we set q := floor(floor(c/2^n)/7) and
  363. // r := c - q*p = (floor(c/2^n) mod 7)*2^n + (c mod 2^n) - q.
  364. // If this becomes negative, set r := r + p.
  365. // (As above, since 0 <= c <= (p-1)^2, we have
  366. // floor(c/p) <= q <= floor(c/p)+1, so we have
  367. // to set r := r + p at most once!)
  368. // n = 92 = 3*32-4 = 2*32+28.
  369. lspref(cLSDptr,6) = shiftleft_loop_lsp(cLSDptr lspop 3,3,4,lspref(cLSDptr,2)>>28);
  370. lspref(cLSDptr,2) &= bit(28)-1;
  371. // c[0..3] now contains floor(c/2^n), c[4..6] contains (c mod 2^n).
  372. var uintD remainder = divu_loop_msp(7,cLSDptr lspop 7,4);
  373. lspref(cLSDptr,2) |= remainder << 28;
  374. // c[0..3] now contains q, c[4..6] contains (c mod 7*2^n).
  375. #ifdef DEBUG_FFTP
  376. if (lspref(cLSDptr,6) > 0)
  377. throw runtime_exception();
  378. #endif
  379. #if 0
  380. if (compare_loop_msp(cLSDptr lspop 6,arrayMSDptr(p._w,3),3) >= 0) // q >= p ?
  381. subfrom_loop_lsp(arrayLSDptr(p._w,3),cLSDptr lspop 3,3); // q -= p;
  382. #endif
  383. if (subfrom_loop_lsp(cLSDptr lspop 3,cLSDptr,3)) // (c mod 2^n) - q
  384. addto_loop_lsp(arrayLSDptr(p._w,3),cLSDptr,3);
  385. r.w2 = lspref(cLSDptr,2); r.w1 = lspref(cLSDptr,1); r.w0 = lspref(cLSDptr,0);
  386. #else
  387. #error "mulp not implemented for this prime"
  388. #endif
  389. if ((sint32)r.w2 < 0)
  390. throw runtime_exception();
  391. check_fftp_word(r);
  392. }
  393. #ifdef DEBUG_FFTP_OPERATIONS
  394. static void mulp_doublecheck (const fftp_word& a, const fftp_word& b, fftp_word& r)
  395. {
  396. fftp_word zero, ma, mb, or;
  397. subp(a,a, zero);
  398. subp(zero,a, ma);
  399. subp(zero,b, mb);
  400. mulp(ma,mb, or);
  401. mulp(a,b, r);
  402. if (compare_loop_msp(arrayMSDptr(r._w,3),arrayMSDptr(or._w,3),3))
  403. throw runtime_exception();
  404. }
  405. #define mulp mulp_doublecheck
  406. #endif /* DEBUG_FFTP_OPERATIONS */
  407. // b := (a / 2) mod p
  408. static inline void shiftp (const fftp_word& a, fftp_word& b)
  409. {
  410. check_fftp_word(a);
  411. if (a.w0 & 1) {
  412. var fftp_word a_even;
  413. add(a,p, a_even);
  414. shift(a_even, b);
  415. } else
  416. shift(a, b);
  417. check_fftp_word(b);
  418. }
  419. #ifndef _BIT_REVERSE
  420. #define _BIT_REVERSE
  421. // Reverse an n-bit number x. n>0.
  422. static uintC bit_reverse (uintL n, uintC x)
  423. {
  424. var uintC y = 0;
  425. do {
  426. y <<= 1;
  427. y |= (x & 1);
  428. x >>= 1;
  429. } while (!(--n == 0));
  430. return y;
  431. }
  432. #endif
  433. // Compute an convolution mod p using FFT: z[0..N-1] := x[0..N-1] * y[0..N-1].
  434. static void fftp_convolution (const uintL n, const uintC N, // N = 2^n
  435. fftp_word * x, // N words
  436. fftp_word * y, // N words
  437. fftp_word * z // N words result
  438. )
  439. {
  440. CL_ALLOCA_STACK;
  441. #if (FFTP_BACKWARD == RECIPROOT) || defined(DEBUG_FFTP)
  442. var fftp_word* const w = cl_alloc_array(fftp_word,N);
  443. #else
  444. var fftp_word* const w = cl_alloc_array(fftp_word,(N>>1)+1);
  445. #endif
  446. var uintC i;
  447. // Initialize w[i] to w^i, w a primitive N-th root of unity.
  448. w[0] = fftp_roots_of_1[0];
  449. w[1] = fftp_roots_of_1[n];
  450. #if (FFTP_BACKWARD == RECIPROOT) || defined(DEBUG_FFTP)
  451. for (i = 2; i < N; i++)
  452. mulp(w[i-1],fftp_roots_of_1[n], w[i]);
  453. #else // need only half of the roots
  454. for (i = 2; i < N>>1; i++)
  455. mulp(w[i-1],fftp_roots_of_1[n], w[i]);
  456. #endif
  457. #ifdef DEBUG_FFTP
  458. // Check that w is really a primitive N-th root of unity.
  459. {
  460. var fftp_word w_N;
  461. mulp(w[N-1],fftp_roots_of_1[n], w_N);
  462. if (!(w_N.w2 == 0 && w_N.w1 == 0 && w_N.w0 == 1))
  463. throw runtime_exception();
  464. w_N = w[N>>1];
  465. if (!(w_N.w2 == p.w2 && w_N.w1 == p.w1 && w_N.w0 == p.w0 - 1))
  466. throw runtime_exception();
  467. }
  468. #endif
  469. var bool squaring = (x == y);
  470. // Do an FFT of length N on x.
  471. {
  472. var sintL l;
  473. /* l = n-1 */ {
  474. var const uintC tmax = N>>1; // tmax = 2^(n-1)
  475. for (var uintC t = 0; t < tmax; t++) {
  476. var uintC i1 = t;
  477. var uintC i2 = i1 + tmax;
  478. // Butterfly: replace (x(i1),x(i2)) by
  479. // (x(i1) + x(i2), x(i1) - x(i2)).
  480. var fftp_word tmp;
  481. tmp = x[i2];
  482. subp(x[i1],tmp, x[i2]);
  483. addp(x[i1],tmp, x[i1]);
  484. }
  485. }
  486. for (l = n-2; l>=0; l--) {
  487. var const uintC smax = (uintC)1 << (n-1-l);
  488. var const uintC tmax = (uintC)1 << l;
  489. for (var uintC s = 0; s < smax; s++) {
  490. var uintC exp = bit_reverse(n-1-l,s) << l;
  491. for (var uintC t = 0; t < tmax; t++) {
  492. var uintC i1 = (s << (l+1)) + t;
  493. var uintC i2 = i1 + tmax;
  494. // Butterfly: replace (x(i1),x(i2)) by
  495. // (x(i1) + w^exp*x(i2), x(i1) - w^exp*x(i2)).
  496. var fftp_word tmp;
  497. mulp(x[i2],w[exp], tmp);
  498. subp(x[i1],tmp, x[i2]);
  499. addp(x[i1],tmp, x[i1]);
  500. }
  501. }
  502. }
  503. }
  504. // Do an FFT of length N on y.
  505. if (!squaring) {
  506. var sintL l;
  507. /* l = n-1 */ {
  508. var uintC const tmax = N>>1; // tmax = 2^(n-1)
  509. for (var uintC t = 0; t < tmax; t++) {
  510. var uintC i1 = t;
  511. var uintC i2 = i1 + tmax;
  512. // Butterfly: replace (y(i1),y(i2)) by
  513. // (y(i1) + y(i2), y(i1) - y(i2)).
  514. var fftp_word tmp;
  515. tmp = y[i2];
  516. subp(y[i1],tmp, y[i2]);
  517. addp(y[i1],tmp, y[i1]);
  518. }
  519. }
  520. for (l = n-2; l>=0; l--) {
  521. var const uintC smax = (uintC)1 << (n-1-l);
  522. var const uintC tmax = (uintC)1 << l;
  523. for (var uintC s = 0; s < smax; s++) {
  524. var uintC exp = bit_reverse(n-1-l,s) << l;
  525. for (var uintC t = 0; t < tmax; t++) {
  526. var uintC i1 = (s << (l+1)) + t;
  527. var uintC i2 = i1 + tmax;
  528. // Butterfly: replace (y(i1),y(i2)) by
  529. // (y(i1) + w^exp*y(i2), y(i1) - w^exp*y(i2)).
  530. var fftp_word tmp;
  531. mulp(y[i2],w[exp], tmp);
  532. subp(y[i1],tmp, y[i2]);
  533. addp(y[i1],tmp, y[i1]);
  534. }
  535. }
  536. }
  537. }
  538. // Multiply the transformed vectors into z.
  539. for (i = 0; i < N; i++)
  540. mulp(x[i],y[i], z[i]);
  541. // Undo an FFT of length N on z.
  542. {
  543. var uintL l;
  544. for (l = 0; l < n-1; l++) {
  545. var const uintC smax = (uintC)1 << (n-1-l);
  546. var const uintC tmax = (uintC)1 << l;
  547. #if FFTP_BACKWARD != CLEVER
  548. for (var uintC s = 0; s < smax; s++) {
  549. var uintC exp = bit_reverse(n-1-l,s) << l;
  550. #if FFTP_BACKWARD == RECIPROOT
  551. if (exp > 0)
  552. exp = N - exp; // negate exp (use w^-1 instead of w)
  553. #endif
  554. for (var uintC t = 0; t < tmax; t++) {
  555. var uintC i1 = (s << (l+1)) + t;
  556. var uintC i2 = i1 + tmax;
  557. // Inverse Butterfly: replace (z(i1),z(i2)) by
  558. // ((z(i1)+z(i2))/2, (z(i1)-z(i2))/(2*w^exp)).
  559. var fftp_word sum;
  560. var fftp_word diff;
  561. addp(z[i1],z[i2], sum);
  562. subp(z[i1],z[i2], diff);
  563. shiftp(sum, z[i1]);
  564. mulp(diff,w[exp], diff); shiftp(diff, z[i2]);
  565. }
  566. }
  567. #else // FFTP_BACKWARD == CLEVER: clever handling of negative exponents
  568. /* s = 0, exp = 0 */ {
  569. for (var uintC t = 0; t < tmax; t++) {
  570. var uintC i1 = t;
  571. var uintC i2 = i1 + tmax;
  572. // Inverse Butterfly: replace (z(i1),z(i2)) by
  573. // ((z(i1)+z(i2))/2, (z(i1)-z(i2))/(2*w^exp)),
  574. // with exp <-- 0.
  575. var fftp_word sum;
  576. var fftp_word diff;
  577. addp(z[i1],z[i2], sum);
  578. subp(z[i1],z[i2], diff);
  579. shiftp(sum, z[i1]);
  580. shiftp(diff, z[i2]);
  581. }
  582. }
  583. for (var uintC s = 1; s < smax; s++) {
  584. var uintC exp = bit_reverse(n-1-l,s) << l;
  585. exp = (N>>1) - exp; // negate exp (use w^-1 instead of w)
  586. for (var uintC t = 0; t < tmax; t++) {
  587. var uintC i1 = (s << (l+1)) + t;
  588. var uintC i2 = i1 + tmax;
  589. // Inverse Butterfly: replace (z(i1),z(i2)) by
  590. // ((z(i1)+z(i2))/2, (z(i1)-z(i2))/(2*w^exp)),
  591. // with exp <-- (N/2 - exp).
  592. var fftp_word sum;
  593. var fftp_word diff;
  594. addp(z[i1],z[i2], sum);
  595. subp(z[i2],z[i1], diff); // note that w^(N/2) = -1
  596. shiftp(sum, z[i1]);
  597. mulp(diff,w[exp], diff); shiftp(diff, z[i2]);
  598. }
  599. }
  600. #endif
  601. }
  602. /* l = n-1 */ {
  603. var const uintC tmax = N>>1; // tmax = 2^(n-1)
  604. for (var uintC t = 0; t < tmax; t++) {
  605. var uintC i1 = t;
  606. var uintC i2 = i1 + tmax;
  607. // Inverse Butterfly: replace (z(i1),z(i2)) by
  608. // ((z(i1)+z(i2))/2, (z(i1)-z(i2))/2).
  609. var fftp_word sum;
  610. var fftp_word diff;
  611. addp(z[i1],z[i2], sum);
  612. subp(z[i1],z[i2], diff);
  613. shiftp(sum, z[i1]);
  614. shiftp(diff, z[i2]);
  615. }
  616. }
  617. }
  618. #if FFTP_BACKWARD == FORWARD
  619. // Swap z[i] and z[N-i] for 0 < i < N/2.
  620. for (i = (N>>1)-1; i > 0; i--) {
  621. var fftp_word tmp = z[i];
  622. z[i] = z[N-i];
  623. z[N-i] = tmp;
  624. }
  625. #endif
  626. }
  627. static void mulu_fft_modp (const uintD* sourceptr1, uintC len1,
  628. const uintD* sourceptr2, uintC len2,
  629. uintD* destptr)
  630. // Es ist 2 <= len1 <= len2.
  631. {
  632. // Methode:
  633. // source1 ist ein Stück der Länge N1, source2 ein oder mehrere Stücke
  634. // der Länge N2, mit N1+N2 <= N, wobei N Zweierpotenz ist.
  635. // sum(i=0..N-1, x_i b^i) * sum(i=0..N-1, y_i b^i) wird errechnet,
  636. // indem man die beiden Polynome
  637. // sum(i=0..N-1, x_i T^i), sum(i=0..N-1, y_i T^i)
  638. // multipliziert, und zwar durch Fourier-Transformation (s.o.).
  639. var uint32 n;
  640. integerlengthC(len1-1, n=); // 2^(n-1) < len1 <= 2^n
  641. var uintC len = (uintC)1 << n; // kleinste Zweierpotenz >= len1
  642. // Wählt man N = len, so hat man ceiling(len2/(len-len1+1)) * FFT(len).
  643. // Wählt man N = 2*len, so hat man ceiling(len2/(2*len-len1+1)) * FFT(2*len).
  644. // Wir wählen das billigere von beiden:
  645. // Bei ceiling(len2/(len-len1+1)) <= 2 * ceiling(len2/(2*len-len1+1))
  646. // nimmt man N = len, bei ....... > ........ dagegen N = 2*len.
  647. // (Wahl von N = 4*len oder mehr bringt nur in Extremfällen etwas.)
  648. if (len2 > 2 * (len-len1+1) * (len2 <= (2*len-len1+1) ? 1 : ceiling(len2,(2*len-len1+1)))) {
  649. n = n+1;
  650. len = len << 1;
  651. }
  652. var const uintC N = len; // N = 2^n
  653. CL_ALLOCA_STACK;
  654. var fftp_word* const x = cl_alloc_array(fftp_word,N);
  655. var fftp_word* const y = cl_alloc_array(fftp_word,N);
  656. #ifdef DEBUG_FFTP
  657. var fftp_word* const z = cl_alloc_array(fftp_word,N);
  658. #else
  659. var fftp_word* const z = x; // put z in place of x - saves memory
  660. #endif
  661. var uintD* const tmpprod = cl_alloc_array(uintD,len1+1);
  662. var uintP i;
  663. var uintC destlen = len1+len2;
  664. clear_loop_lsp(destptr,destlen);
  665. do {
  666. var uintC len2p; // length of a piece of source2
  667. len2p = N - len1 + 1;
  668. if (len2p > len2)
  669. len2p = len2;
  670. // len2p = min(N-len1+1,len2).
  671. if (len2p == 1) {
  672. // cheap case
  673. var uintD* tmpptr = arrayLSDptr(tmpprod,len1+1);
  674. mulu_loop_lsp(lspref(sourceptr2,0),sourceptr1,tmpptr,len1);
  675. if (addto_loop_lsp(tmpptr,destptr,len1+1))
  676. if (inc_loop_lsp(destptr lspop (len1+1),destlen-(len1+1)))
  677. throw runtime_exception();
  678. } else {
  679. var uintC destlenp = len1 + len2p - 1;
  680. // destlenp = min(N,destlen-1).
  681. var bool squaring = ((sourceptr1 == sourceptr2) && (len1 == len2p));
  682. // Fill factor x.
  683. {
  684. for (i = 0; i < len1; i++) {
  685. x[i].w0 = lspref(sourceptr1,i);
  686. x[i].w1 = 0;
  687. x[i].w2 = 0;
  688. }
  689. for (i = len1; i < N; i++) {
  690. x[i].w0 = 0;
  691. x[i].w1 = 0;
  692. x[i].w2 = 0;
  693. }
  694. }
  695. // Fill factor y.
  696. if (!squaring) {
  697. for (i = 0; i < len2p; i++) {
  698. y[i].w0 = lspref(sourceptr2,i);
  699. y[i].w1 = 0;
  700. y[i].w2 = 0;
  701. }
  702. for (i = len2p; i < N; i++) {
  703. y[i].w0 = 0;
  704. y[i].w1 = 0;
  705. y[i].w2 = 0;
  706. }
  707. }
  708. // Multiply.
  709. if (!squaring)
  710. fftp_convolution(n,N, &x[0], &y[0], &z[0]);
  711. else
  712. fftp_convolution(n,N, &x[0], &x[0], &z[0]);
  713. #ifdef DEBUG_FFTP
  714. // Check result.
  715. for (i = 0; i < N; i++)
  716. if (!(z[i].w2 < N))
  717. throw runtime_exception();
  718. #endif
  719. // Add result to destptr[-destlen..-1]:
  720. {
  721. var uintD* ptr = destptr;
  722. // ac2|ac1|ac0 are an accumulator.
  723. var uint32 ac0 = 0;
  724. var uint32 ac1 = 0;
  725. var uint32 ac2 = 0;
  726. var uint32 tmp;
  727. for (i = 0; i < destlenp; i++) {
  728. // Add z[i] to the accumulator.
  729. tmp = z[i].w0;
  730. if ((ac0 += tmp) < tmp) {
  731. if (++ac1 == 0)
  732. ++ac2;
  733. }
  734. tmp = z[i].w1;
  735. if ((ac1 += tmp) < tmp)
  736. ++ac2;
  737. tmp = z[i].w2;
  738. ac2 += tmp;
  739. // Add the accumulator's least significant word to destptr:
  740. tmp = lspref(ptr,0);
  741. if ((ac0 += tmp) < tmp) {
  742. if (++ac1 == 0)
  743. ++ac2;
  744. }
  745. lspref(ptr,0) = ac0;
  746. lsshrink(ptr);
  747. ac0 = ac1;
  748. ac1 = ac2;
  749. ac2 = 0;
  750. }
  751. // ac2 = 0.
  752. if (ac1 > 0) {
  753. if (!((i += 2) <= destlen))
  754. throw runtime_exception();
  755. tmp = lspref(ptr,0);
  756. if ((ac0 += tmp) < tmp)
  757. ++ac1;
  758. lspref(ptr,0) = ac0;
  759. lsshrink(ptr);
  760. tmp = lspref(ptr,0);
  761. ac1 += tmp;
  762. lspref(ptr,0) = ac1;
  763. lsshrink(ptr);
  764. if (ac1 < tmp)
  765. if (inc_loop_lsp(ptr,destlen-i))
  766. throw runtime_exception();
  767. } else if (ac0 > 0) {
  768. if (!((i += 1) <= destlen))
  769. throw runtime_exception();
  770. tmp = lspref(ptr,0);
  771. ac0 += tmp;
  772. lspref(ptr,0) = ac0;
  773. lsshrink(ptr);
  774. if (ac0 < tmp)
  775. if (inc_loop_lsp(ptr,destlen-i))
  776. throw runtime_exception();
  777. }
  778. }
  779. #ifdef DEBUG_FFTP
  780. // If destlenp < N, check that the remaining z[i] are 0.
  781. for (i = destlenp; i < N; i++)
  782. if (z[i].w2 > 0 || z[i].w1 > 0 || z[i].w0 > 0)
  783. throw runtime_exception();
  784. #endif
  785. }
  786. // Decrement len2.
  787. destptr = destptr lspop len2p;
  788. destlen -= len2p;
  789. sourceptr2 = sourceptr2 lspop len2p;
  790. len2 -= len2p;
  791. } while (len2 > 0);
  792. }
  793. #undef FFT_P_94
  794. #undef FFT_P_92
  795. #undef w0
  796. #undef w1
  797. #undef w2
  798. #undef W3