/3rd_party/llvm/utils/PerfectShuffle/PerfectShuffle.cpp

https://code.google.com/p/softart/ · C++ · 572 lines · 397 code · 92 blank · 83 comment · 112 complexity · bed6368b3ac61cc39c125476860f4fa5 MD5 · raw file

  1. //===-- PerfectShuffle.cpp - Perfect Shuffle Generator --------------------===//
  2. //
  3. // The LLVM Compiler Infrastructure
  4. //
  5. // This file is distributed under the University of Illinois Open Source
  6. // License. See LICENSE.TXT for details.
  7. //
  8. //===----------------------------------------------------------------------===//
  9. //
  10. // This file computes an optimal sequence of instructions for doing all shuffles
  11. // of two 4-element vectors. With a release build and when configured to emit
  12. // an altivec instruction table, this takes about 30s to run on a 2.7GHz
  13. // PowerPC G5.
  14. //
  15. //===----------------------------------------------------------------------===//
  16. #include <cassert>
  17. #include <cstdlib>
  18. #include <iomanip>
  19. #include <iostream>
  20. #include <vector>
  21. struct Operator;
  22. // Masks are 4-nibble hex numbers. Values 0-7 in any nibble means that it takes
  23. // an element from that value of the input vectors. A value of 8 means the
  24. // entry is undefined.
  25. // Mask manipulation functions.
  26. static inline unsigned short MakeMask(unsigned V0, unsigned V1,
  27. unsigned V2, unsigned V3) {
  28. return (V0 << (3*4)) | (V1 << (2*4)) | (V2 << (1*4)) | (V3 << (0*4));
  29. }
  30. /// getMaskElt - Return element N of the specified mask.
  31. static unsigned getMaskElt(unsigned Mask, unsigned Elt) {
  32. return (Mask >> ((3-Elt)*4)) & 0xF;
  33. }
  34. static unsigned setMaskElt(unsigned Mask, unsigned Elt, unsigned NewVal) {
  35. unsigned FieldShift = ((3-Elt)*4);
  36. return (Mask & ~(0xF << FieldShift)) | (NewVal << FieldShift);
  37. }
  38. // Reject elements where the values are 9-15.
  39. static bool isValidMask(unsigned short Mask) {
  40. unsigned short UndefBits = Mask & 0x8888;
  41. return (Mask & ((UndefBits >> 1)|(UndefBits>>2)|(UndefBits>>3))) == 0;
  42. }
  43. /// hasUndefElements - Return true if any of the elements in the mask are undefs
  44. ///
  45. static bool hasUndefElements(unsigned short Mask) {
  46. return (Mask & 0x8888) != 0;
  47. }
  48. /// isOnlyLHSMask - Return true if this mask only refers to its LHS, not
  49. /// including undef values..
  50. static bool isOnlyLHSMask(unsigned short Mask) {
  51. return (Mask & 0x4444) == 0;
  52. }
  53. /// getLHSOnlyMask - Given a mask that refers to its LHS and RHS, modify it to
  54. /// refer to the LHS only (for when one argument value is passed into the same
  55. /// function twice).
  56. #if 0
  57. static unsigned short getLHSOnlyMask(unsigned short Mask) {
  58. return Mask & 0xBBBB; // Keep only LHS and Undefs.
  59. }
  60. #endif
  61. /// getCompressedMask - Turn a 16-bit uncompressed mask (where each elt uses 4
  62. /// bits) into a compressed 13-bit mask, where each elt is multiplied by 9.
  63. static unsigned getCompressedMask(unsigned short Mask) {
  64. return getMaskElt(Mask, 0)*9*9*9 + getMaskElt(Mask, 1)*9*9 +
  65. getMaskElt(Mask, 2)*9 + getMaskElt(Mask, 3);
  66. }
  67. static void PrintMask(unsigned i, std::ostream &OS) {
  68. OS << "<" << (char)(getMaskElt(i, 0) == 8 ? 'u' : ('0'+getMaskElt(i, 0)))
  69. << "," << (char)(getMaskElt(i, 1) == 8 ? 'u' : ('0'+getMaskElt(i, 1)))
  70. << "," << (char)(getMaskElt(i, 2) == 8 ? 'u' : ('0'+getMaskElt(i, 2)))
  71. << "," << (char)(getMaskElt(i, 3) == 8 ? 'u' : ('0'+getMaskElt(i, 3)))
  72. << ">";
  73. }
  74. /// ShuffleVal - This represents a shufflevector operation.
  75. struct ShuffleVal {
  76. unsigned Cost; // Number of instrs used to generate this value.
  77. Operator *Op; // The Operation used to generate this value.
  78. unsigned short Arg0, Arg1; // Input operands for this value.
  79. ShuffleVal() : Cost(1000000) {}
  80. };
  81. /// ShufTab - This is the actual shuffle table that we are trying to generate.
  82. ///
  83. static ShuffleVal ShufTab[65536];
  84. /// TheOperators - All of the operators that this target supports.
  85. static std::vector<Operator*> TheOperators;
  86. /// Operator - This is a vector operation that is available for use.
  87. struct Operator {
  88. unsigned short ShuffleMask;
  89. unsigned short OpNum;
  90. const char *Name;
  91. unsigned Cost;
  92. Operator(unsigned short shufflemask, const char *name, unsigned opnum,
  93. unsigned cost = 1)
  94. : ShuffleMask(shufflemask), OpNum(opnum), Name(name), Cost(cost) {
  95. TheOperators.push_back(this);
  96. }
  97. ~Operator() {
  98. assert(TheOperators.back() == this);
  99. TheOperators.pop_back();
  100. }
  101. bool isOnlyLHSOperator() const {
  102. return isOnlyLHSMask(ShuffleMask);
  103. }
  104. const char *getName() const { return Name; }
  105. unsigned getCost() const { return Cost; }
  106. unsigned short getTransformedMask(unsigned short LHSMask, unsigned RHSMask) {
  107. // Extract the elements from LHSMask and RHSMask, as appropriate.
  108. unsigned Result = 0;
  109. for (unsigned i = 0; i != 4; ++i) {
  110. unsigned SrcElt = (ShuffleMask >> (4*i)) & 0xF;
  111. unsigned ResElt;
  112. if (SrcElt < 4)
  113. ResElt = getMaskElt(LHSMask, SrcElt);
  114. else if (SrcElt < 8)
  115. ResElt = getMaskElt(RHSMask, SrcElt-4);
  116. else {
  117. assert(SrcElt == 8 && "Bad src elt!");
  118. ResElt = 8;
  119. }
  120. Result |= ResElt << (4*i);
  121. }
  122. return Result;
  123. }
  124. };
  125. static const char *getZeroCostOpName(unsigned short Op) {
  126. if (ShufTab[Op].Arg0 == 0x0123)
  127. return "LHS";
  128. else if (ShufTab[Op].Arg0 == 0x4567)
  129. return "RHS";
  130. else {
  131. assert(0 && "bad zero cost operation");
  132. abort();
  133. }
  134. }
  135. static void PrintOperation(unsigned ValNo, unsigned short Vals[]) {
  136. unsigned short ThisOp = Vals[ValNo];
  137. std::cerr << "t" << ValNo;
  138. PrintMask(ThisOp, std::cerr);
  139. std::cerr << " = " << ShufTab[ThisOp].Op->getName() << "(";
  140. if (ShufTab[ShufTab[ThisOp].Arg0].Cost == 0) {
  141. std::cerr << getZeroCostOpName(ShufTab[ThisOp].Arg0);
  142. PrintMask(ShufTab[ThisOp].Arg0, std::cerr);
  143. } else {
  144. // Figure out what tmp # it is.
  145. for (unsigned i = 0; ; ++i)
  146. if (Vals[i] == ShufTab[ThisOp].Arg0) {
  147. std::cerr << "t" << i;
  148. break;
  149. }
  150. }
  151. if (!ShufTab[Vals[ValNo]].Op->isOnlyLHSOperator()) {
  152. std::cerr << ", ";
  153. if (ShufTab[ShufTab[ThisOp].Arg1].Cost == 0) {
  154. std::cerr << getZeroCostOpName(ShufTab[ThisOp].Arg1);
  155. PrintMask(ShufTab[ThisOp].Arg1, std::cerr);
  156. } else {
  157. // Figure out what tmp # it is.
  158. for (unsigned i = 0; ; ++i)
  159. if (Vals[i] == ShufTab[ThisOp].Arg1) {
  160. std::cerr << "t" << i;
  161. break;
  162. }
  163. }
  164. }
  165. std::cerr << ") ";
  166. }
  167. static unsigned getNumEntered() {
  168. unsigned Count = 0;
  169. for (unsigned i = 0; i != 65536; ++i)
  170. Count += ShufTab[i].Cost < 100;
  171. return Count;
  172. }
  173. static void EvaluateOps(unsigned short Elt, unsigned short Vals[],
  174. unsigned &NumVals) {
  175. if (ShufTab[Elt].Cost == 0) return;
  176. // If this value has already been evaluated, it is free. FIXME: match undefs.
  177. for (unsigned i = 0, e = NumVals; i != e; ++i)
  178. if (Vals[i] == Elt) return;
  179. // Otherwise, get the operands of the value, then add it.
  180. unsigned Arg0 = ShufTab[Elt].Arg0, Arg1 = ShufTab[Elt].Arg1;
  181. if (ShufTab[Arg0].Cost)
  182. EvaluateOps(Arg0, Vals, NumVals);
  183. if (Arg0 != Arg1 && ShufTab[Arg1].Cost)
  184. EvaluateOps(Arg1, Vals, NumVals);
  185. Vals[NumVals++] = Elt;
  186. }
  187. int main() {
  188. // Seed the table with accesses to the LHS and RHS.
  189. ShufTab[0x0123].Cost = 0;
  190. ShufTab[0x0123].Op = 0;
  191. ShufTab[0x0123].Arg0 = 0x0123;
  192. ShufTab[0x4567].Cost = 0;
  193. ShufTab[0x4567].Op = 0;
  194. ShufTab[0x4567].Arg0 = 0x4567;
  195. // Seed the first-level of shuffles, shuffles whose inputs are the input to
  196. // the vectorshuffle operation.
  197. bool MadeChange = true;
  198. unsigned OpCount = 0;
  199. while (MadeChange) {
  200. MadeChange = false;
  201. ++OpCount;
  202. std::cerr << "Starting iteration #" << OpCount << " with "
  203. << getNumEntered() << " entries established.\n";
  204. // Scan the table for two reasons: First, compute the maximum cost of any
  205. // operation left in the table. Second, make sure that values with undefs
  206. // have the cheapest alternative that they match.
  207. unsigned MaxCost = ShufTab[0].Cost;
  208. for (unsigned i = 1; i != 0x8889; ++i) {
  209. if (!isValidMask(i)) continue;
  210. if (ShufTab[i].Cost > MaxCost)
  211. MaxCost = ShufTab[i].Cost;
  212. // If this value has an undef, make it be computed the cheapest possible
  213. // way of any of the things that it matches.
  214. if (hasUndefElements(i)) {
  215. // This code is a little bit tricky, so here's the idea: consider some
  216. // permutation, like 7u4u. To compute the lowest cost for 7u4u, we
  217. // need to take the minimum cost of all of 7[0-8]4[0-8], 81 entries. If
  218. // there are 3 undefs, the number rises to 729 entries we have to scan,
  219. // and for the 4 undef case, we have to scan the whole table.
  220. //
  221. // Instead of doing this huge amount of scanning, we process the table
  222. // entries *in order*, and use the fact that 'u' is 8, larger than any
  223. // valid index. Given an entry like 7u4u then, we only need to scan
  224. // 7[0-7]4u - 8 entries. We can get away with this, because we already
  225. // know that each of 704u, 714u, 724u, etc contain the minimum value of
  226. // all of the 704[0-8], 714[0-8] and 724[0-8] entries respectively.
  227. unsigned UndefIdx;
  228. if (i & 0x8000)
  229. UndefIdx = 0;
  230. else if (i & 0x0800)
  231. UndefIdx = 1;
  232. else if (i & 0x0080)
  233. UndefIdx = 2;
  234. else if (i & 0x0008)
  235. UndefIdx = 3;
  236. else
  237. abort();
  238. unsigned MinVal = i;
  239. unsigned MinCost = ShufTab[i].Cost;
  240. // Scan the 8 entries.
  241. for (unsigned j = 0; j != 8; ++j) {
  242. unsigned NewElt = setMaskElt(i, UndefIdx, j);
  243. if (ShufTab[NewElt].Cost < MinCost) {
  244. MinCost = ShufTab[NewElt].Cost;
  245. MinVal = NewElt;
  246. }
  247. }
  248. // If we found something cheaper than what was here before, use it.
  249. if (i != MinVal) {
  250. MadeChange = true;
  251. ShufTab[i] = ShufTab[MinVal];
  252. }
  253. }
  254. }
  255. for (unsigned LHS = 0; LHS != 0x8889; ++LHS) {
  256. if (!isValidMask(LHS)) continue;
  257. if (ShufTab[LHS].Cost > 1000) continue;
  258. // If nothing involving this operand could possibly be cheaper than what
  259. // we already have, don't consider it.
  260. if (ShufTab[LHS].Cost + 1 >= MaxCost)
  261. continue;
  262. for (unsigned opnum = 0, e = TheOperators.size(); opnum != e; ++opnum) {
  263. Operator *Op = TheOperators[opnum];
  264. // Evaluate op(LHS,LHS)
  265. unsigned ResultMask = Op->getTransformedMask(LHS, LHS);
  266. unsigned Cost = ShufTab[LHS].Cost + Op->getCost();
  267. if (Cost < ShufTab[ResultMask].Cost) {
  268. ShufTab[ResultMask].Cost = Cost;
  269. ShufTab[ResultMask].Op = Op;
  270. ShufTab[ResultMask].Arg0 = LHS;
  271. ShufTab[ResultMask].Arg1 = LHS;
  272. MadeChange = true;
  273. }
  274. // If this is a two input instruction, include the op(x,y) cases. If
  275. // this is a one input instruction, skip this.
  276. if (Op->isOnlyLHSOperator()) continue;
  277. for (unsigned RHS = 0; RHS != 0x8889; ++RHS) {
  278. if (!isValidMask(RHS)) continue;
  279. if (ShufTab[RHS].Cost > 1000) continue;
  280. // If nothing involving this operand could possibly be cheaper than
  281. // what we already have, don't consider it.
  282. if (ShufTab[RHS].Cost + 1 >= MaxCost)
  283. continue;
  284. // Evaluate op(LHS,RHS)
  285. unsigned ResultMask = Op->getTransformedMask(LHS, RHS);
  286. if (ShufTab[ResultMask].Cost <= OpCount ||
  287. ShufTab[ResultMask].Cost <= ShufTab[LHS].Cost ||
  288. ShufTab[ResultMask].Cost <= ShufTab[RHS].Cost)
  289. continue;
  290. // Figure out the cost to evaluate this, knowing that CSE's only need
  291. // to be evaluated once.
  292. unsigned short Vals[30];
  293. unsigned NumVals = 0;
  294. EvaluateOps(LHS, Vals, NumVals);
  295. EvaluateOps(RHS, Vals, NumVals);
  296. unsigned Cost = NumVals + Op->getCost();
  297. if (Cost < ShufTab[ResultMask].Cost) {
  298. ShufTab[ResultMask].Cost = Cost;
  299. ShufTab[ResultMask].Op = Op;
  300. ShufTab[ResultMask].Arg0 = LHS;
  301. ShufTab[ResultMask].Arg1 = RHS;
  302. MadeChange = true;
  303. }
  304. }
  305. }
  306. }
  307. }
  308. std::cerr << "Finished Table has " << getNumEntered()
  309. << " entries established.\n";
  310. unsigned CostArray[10] = { 0 };
  311. // Compute a cost histogram.
  312. for (unsigned i = 0; i != 65536; ++i) {
  313. if (!isValidMask(i)) continue;
  314. if (ShufTab[i].Cost > 9)
  315. ++CostArray[9];
  316. else
  317. ++CostArray[ShufTab[i].Cost];
  318. }
  319. for (unsigned i = 0; i != 9; ++i)
  320. if (CostArray[i])
  321. std::cout << "// " << CostArray[i] << " entries have cost " << i << "\n";
  322. if (CostArray[9])
  323. std::cout << "// " << CostArray[9] << " entries have higher cost!\n";
  324. // Build up the table to emit.
  325. std::cout << "\n// This table is 6561*4 = 26244 bytes in size.\n";
  326. std::cout << "static const unsigned PerfectShuffleTable[6561+1] = {\n";
  327. for (unsigned i = 0; i != 0x8889; ++i) {
  328. if (!isValidMask(i)) continue;
  329. // CostSat - The cost of this operation saturated to two bits.
  330. unsigned CostSat = ShufTab[i].Cost;
  331. if (CostSat > 4) CostSat = 4;
  332. if (CostSat == 0) CostSat = 1;
  333. --CostSat; // Cost is now between 0-3.
  334. unsigned OpNum = ShufTab[i].Op ? ShufTab[i].Op->OpNum : 0;
  335. assert(OpNum < 16 && "Too few bits to encode operation!");
  336. unsigned LHS = getCompressedMask(ShufTab[i].Arg0);
  337. unsigned RHS = getCompressedMask(ShufTab[i].Arg1);
  338. // Encode this as 2 bits of saturated cost, 4 bits of opcodes, 13 bits of
  339. // LHS, and 13 bits of RHS = 32 bits.
  340. unsigned Val = (CostSat << 30) | (OpNum << 26) | (LHS << 13) | RHS;
  341. std::cout << " " << std::setw(10) << Val << "U, // ";
  342. PrintMask(i, std::cout);
  343. std::cout << ": Cost " << ShufTab[i].Cost;
  344. std::cout << " " << (ShufTab[i].Op ? ShufTab[i].Op->getName() : "copy");
  345. std::cout << " ";
  346. if (ShufTab[ShufTab[i].Arg0].Cost == 0) {
  347. std::cout << getZeroCostOpName(ShufTab[i].Arg0);
  348. } else {
  349. PrintMask(ShufTab[i].Arg0, std::cout);
  350. }
  351. if (ShufTab[i].Op && !ShufTab[i].Op->isOnlyLHSOperator()) {
  352. std::cout << ", ";
  353. if (ShufTab[ShufTab[i].Arg1].Cost == 0) {
  354. std::cout << getZeroCostOpName(ShufTab[i].Arg1);
  355. } else {
  356. PrintMask(ShufTab[i].Arg1, std::cout);
  357. }
  358. }
  359. std::cout << "\n";
  360. }
  361. std::cout << " 0\n};\n";
  362. if (0) {
  363. // Print out the table.
  364. for (unsigned i = 0; i != 0x8889; ++i) {
  365. if (!isValidMask(i)) continue;
  366. if (ShufTab[i].Cost < 1000) {
  367. PrintMask(i, std::cerr);
  368. std::cerr << " - Cost " << ShufTab[i].Cost << " - ";
  369. unsigned short Vals[30];
  370. unsigned NumVals = 0;
  371. EvaluateOps(i, Vals, NumVals);
  372. for (unsigned j = 0, e = NumVals; j != e; ++j)
  373. PrintOperation(j, Vals);
  374. std::cerr << "\n";
  375. }
  376. }
  377. }
  378. }
  379. #ifdef GENERATE_ALTIVEC
  380. ///===---------------------------------------------------------------------===//
  381. /// The altivec instruction definitions. This is the altivec-specific part of
  382. /// this file.
  383. ///===---------------------------------------------------------------------===//
  384. // Note that the opcode numbers here must match those in the PPC backend.
  385. enum {
  386. OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
  387. OP_VMRGHW,
  388. OP_VMRGLW,
  389. OP_VSPLTISW0,
  390. OP_VSPLTISW1,
  391. OP_VSPLTISW2,
  392. OP_VSPLTISW3,
  393. OP_VSLDOI4,
  394. OP_VSLDOI8,
  395. OP_VSLDOI12
  396. };
  397. struct vmrghw : public Operator {
  398. vmrghw() : Operator(0x0415, "vmrghw", OP_VMRGHW) {}
  399. } the_vmrghw;
  400. struct vmrglw : public Operator {
  401. vmrglw() : Operator(0x2637, "vmrglw", OP_VMRGLW) {}
  402. } the_vmrglw;
  403. template<unsigned Elt>
  404. struct vspltisw : public Operator {
  405. vspltisw(const char *N, unsigned Opc)
  406. : Operator(MakeMask(Elt, Elt, Elt, Elt), N, Opc) {}
  407. };
  408. vspltisw<0> the_vspltisw0("vspltisw0", OP_VSPLTISW0);
  409. vspltisw<1> the_vspltisw1("vspltisw1", OP_VSPLTISW1);
  410. vspltisw<2> the_vspltisw2("vspltisw2", OP_VSPLTISW2);
  411. vspltisw<3> the_vspltisw3("vspltisw3", OP_VSPLTISW3);
  412. template<unsigned N>
  413. struct vsldoi : public Operator {
  414. vsldoi(const char *Name, unsigned Opc)
  415. : Operator(MakeMask(N&7, (N+1)&7, (N+2)&7, (N+3)&7), Name, Opc) {
  416. }
  417. };
  418. vsldoi<1> the_vsldoi1("vsldoi4" , OP_VSLDOI4);
  419. vsldoi<2> the_vsldoi2("vsldoi8" , OP_VSLDOI8);
  420. vsldoi<3> the_vsldoi3("vsldoi12", OP_VSLDOI12);
  421. #endif
  422. #define GENERATE_NEON
  423. #ifdef GENERATE_NEON
  424. enum {
  425. OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
  426. OP_VREV,
  427. OP_VDUP0,
  428. OP_VDUP1,
  429. OP_VDUP2,
  430. OP_VDUP3,
  431. OP_VEXT1,
  432. OP_VEXT2,
  433. OP_VEXT3,
  434. OP_VUZPL, // VUZP, left result
  435. OP_VUZPR, // VUZP, right result
  436. OP_VZIPL, // VZIP, left result
  437. OP_VZIPR, // VZIP, right result
  438. OP_VTRNL, // VTRN, left result
  439. OP_VTRNR // VTRN, right result
  440. };
  441. struct vrev : public Operator {
  442. vrev() : Operator(0x1032, "vrev", OP_VREV) {}
  443. } the_vrev;
  444. template<unsigned Elt>
  445. struct vdup : public Operator {
  446. vdup(const char *N, unsigned Opc)
  447. : Operator(MakeMask(Elt, Elt, Elt, Elt), N, Opc) {}
  448. };
  449. vdup<0> the_vdup0("vdup0", OP_VDUP0);
  450. vdup<1> the_vdup1("vdup1", OP_VDUP1);
  451. vdup<2> the_vdup2("vdup2", OP_VDUP2);
  452. vdup<3> the_vdup3("vdup3", OP_VDUP3);
  453. template<unsigned N>
  454. struct vext : public Operator {
  455. vext(const char *Name, unsigned Opc)
  456. : Operator(MakeMask(N&7, (N+1)&7, (N+2)&7, (N+3)&7), Name, Opc) {
  457. }
  458. };
  459. vext<1> the_vext1("vext1", OP_VEXT1);
  460. vext<2> the_vext2("vext2", OP_VEXT2);
  461. vext<3> the_vext3("vext3", OP_VEXT3);
  462. struct vuzpl : public Operator {
  463. vuzpl() : Operator(0x0246, "vuzpl", OP_VUZPL, 2) {}
  464. } the_vuzpl;
  465. struct vuzpr : public Operator {
  466. vuzpr() : Operator(0x1357, "vuzpr", OP_VUZPR, 2) {}
  467. } the_vuzpr;
  468. struct vzipl : public Operator {
  469. vzipl() : Operator(0x0415, "vzipl", OP_VZIPL, 2) {}
  470. } the_vzipl;
  471. struct vzipr : public Operator {
  472. vzipr() : Operator(0x2637, "vzipr", OP_VZIPR, 2) {}
  473. } the_vzipr;
  474. struct vtrnl : public Operator {
  475. vtrnl() : Operator(0x0426, "vtrnl", OP_VTRNL, 2) {}
  476. } the_vtrnl;
  477. struct vtrnr : public Operator {
  478. vtrnr() : Operator(0x1537, "vtrnr", OP_VTRNR, 2) {}
  479. } the_vtrnr;
  480. #endif