
/ptlsim/lib/logic.h

https://github.com/stefanneumann/marss
Possible License(s): LGPL-2.1, GPL-2.0
// -*- c++ -*-
//
// Sequential Logic Primitives for C++
//
// Copyright 1999-2008 Matt T. Yourst <yourst@yourst.com>
//

#ifndef _LOGIC_H_
#define _LOGIC_H_

#include <globals.h>
#include <superstl.h>

inline vec16b x86_sse_ldvbu(const vec16b* m) { vec16b rd; asm("movdqu %[m],%[rd]" : [rd] "=x" (rd) : [m] "xm" (*m)); return rd; }
inline void x86_sse_stvbu(vec16b* m, const vec16b ra) { asm("movdqu %[ra],%[m]" : [m] "=xm" (*m) : [ra] "x" (ra) : "memory"); }

inline vec8w x86_sse_ldvwu(const vec8w* m) {
  vec8w rd;
  asm("movdqu %[m],%[rd]" : [rd] "=x" (rd) : [m] "m" (*m));
  return rd;
}

//inline vec8w x86_sse_ldvwu(const vec8w* m) { vec8w rd; asm("movdqu %[rd], %[m]" : [rd] "=x" (rd) : [m] "xm" (*m)); return rd; }
inline void x86_sse_stvwu(vec8w* m, const vec8w ra) { asm("movdqu %[ra],%[m]" : [m] "=m" (*m) : [ra] "x" (ra) : "memory"); }

extern ofstream ptl_logfile;
extern ofstream yaml_stats_file;
template <typename T>
struct latch {
  T data;
  T newdata;

  latch() {
    reset();
  }

  void reset(const T& d = T()) {
    data = d;
    newdata = d;
  }

  // Initialize both the current and pending values, so the latch reads
  // back its initial value even before the first clock edge (the
  // original only set newdata, leaving data uninitialized):
  latch(const T& t) { reset(t); }

  operator T() const { return data; }

  T& operator =(const T& t) {
    newdata = t; return data;
  }

  void clock(bool clkenable = true) {
    if (clkenable)
      data = newdata;
  }
};
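
//
// Illustrative usage sketch (not part of the original header): a latch
// behaves like a D flip-flop. Writes land in newdata and only become
// visible through data after clock() is called:
//
//   latch<int> r;        // data = newdata = 0
//   r = 42;              // schedules the new value
//   int before = r;      // still 0: no clock edge yet
//   r.clock();
//   int after = r;       // now 42
//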
template <typename T, int size>
struct SynchronousRegisterFile {
  SynchronousRegisterFile() {
    reset();
  }

  void reset() {
    for (int i = 0; i < size; i++) {
      data[i].data = 0;
      data[i].newdata = 0;
    }
  }

  latch<T> data[size];

  latch<T>& operator [](int i) {
    return data[i];
  }

  void clock(bool clkenable = true) {
    if (!clkenable)
      return;
    for (int i = 0; i < size; i++) {
      data[i].clock();
    }
  }
};
//
// Queue
//

// Iterate forward through queue from head to tail
#define foreach_forward(Q, i) for (int i = (Q).head; i != (Q).tail; i = add_index_modulo(i, +1, (Q).size))

// Iterate forward through queue from the specified entry until the tail
#define foreach_forward_from(Q, E, i) for (int i = E->index(); i != (Q).tail; i = add_index_modulo(i, +1, (Q).size))

// Iterate forward through queue from the entry after the specified entry until the tail
#define foreach_forward_after(Q, E, i) for (int i = add_index_modulo(E->index(), +1, (Q).size); i != (Q).tail; i = add_index_modulo(i, +1, (Q).size))

// Iterate backward through queue from tail to head
#define foreach_backward(Q, i) for (int i = add_index_modulo((Q).tail, -1, (Q).size); i != add_index_modulo((Q).head, -1, (Q).size); i = add_index_modulo(i, -1, (Q).size))

// Iterate backward through queue from the specified entry until the head
#define foreach_backward_from(Q, E, i) for (int i = E->index(); i != add_index_modulo((Q).head, -1, (Q).size); i = add_index_modulo(i, -1, (Q).size))

// Iterate backward through queue from the entry before the specified entry until the head
#define foreach_backward_before(Q, E, i) for (int i = add_index_modulo(E->index(), -1, (Q).size); ((i != add_index_modulo((Q).head, -1, (Q).size)) && (E->index() != (Q).head)); i = add_index_modulo(i, -1, (Q).size))
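
//
// Illustrative sketch (assumed usage, not in the original source):
// walking a circular queue with the iteration macros above.
// add_index_modulo() wraps the index around the ring buffer, and
// process() here stands in for whatever per-entry work is needed:
//
//   FixedQueue<Entry, 16> q;
//   foreach_forward(q, i) {
//     process(q[i]);    // visits live entries in head-to-tail order
//   }
//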
template <class T, int SIZE>
struct FixedQueue: public array<T, SIZE> {
  int head;  // oldest entry: commit()/dequeue() remove entries here
  int tail;  // newest entry: alloc()/push() append entries here
  int count; // count of entries
  static const int size = SIZE;

  FixedQueue() {
    reset();
  }

  void flush() {
    head = tail = count = 0;
  }
  void reset() {
    head = tail = count = 0;
  }

  int remaining() const {
    return max((SIZE - count) - 1, 0);
  }

  bool empty() const {
    return (!count);
  }

  bool full() const {
    return (!remaining());
  }

  T* alloc() {
    if (!remaining())
      return NULL;
    T* entry = &(*this)[tail];
    tail = add_index_modulo(tail, +1, SIZE);
    count++;
    return entry;
  }

  T* push() {
    return alloc();
  }

  T* push(const T& data) {
    T* slot = push();
    if (!slot) return NULL;
    *slot = data;
    return slot;
  }

  T* enqueue(const T& data) {
    return push(data);
  }

  void commit(T& entry) {
    assert(entry.index() == head);
    count--;
    head = add_index_modulo(head, +1, SIZE);
  }

  void annul(T& entry) {
    // assert(entry.index() == add_index_modulo(tail, -1, SIZE));
    count--;
    tail = add_index_modulo(tail, -1, SIZE);
  }

  T* pop() {
    if (empty()) return NULL;
    tail = add_index_modulo(tail, -1, SIZE);
    count--;
    return &(*this)[tail];
  }

  T* peek() {
    if (empty())
      return NULL;
    return &(*this)[head];
  }

  T* dequeue() {
    if (empty())
      return NULL;
    count--;
    T* entry = &(*this)[head];
    head = add_index_modulo(head, +1, SIZE);
    return entry;
  }

  void commit(T* entry) { commit(*entry); }
  void annul(T* entry) { annul(*entry); }

  T* pushhead() {
    if (full()) return NULL;
    head = add_index_modulo(head, -1, SIZE);
    count++;
    return &(*this)[head];
  }

  T* pophead() {
    if (empty()) return NULL;
    T* p = &(*this)[head];
    count--;
    head = add_index_modulo(head, +1, SIZE);
    return p;
  }

  T* peekhead() {
    if (empty()) return NULL;
    return &(*this)[head];
  }

  T* peektail() {
    if (empty()) return NULL;
    int t = add_index_modulo(tail, -1, SIZE);
    return &(*this)[t];
  }

  T& operator ()(int index) {
    index = add_index_modulo(head, index, SIZE);
    return (*this)[index];
  }

  const T& operator ()(int index) const {
    index = add_index_modulo(head, index, SIZE);
    return (*this)[index];
  }

  ostream& print(ostream& os) const {
    os << "Queue<", SIZE, ">: head ", head, " to tail ", tail, " (", count, " entries):", endl;
    foreach_forward((*this), i) {
      const T& entry = (*this)[i];
      os << " slot ", intstring(i, 3), ": ", entry, endl;
    }
    return os;
  }
};
template <class T, int SIZE>
ostream& operator <<(ostream& os, FixedQueue<T, SIZE>& queue) {
  return queue.print(os);
}
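
//
// Illustrative sketch (assumed usage, not in the original source):
// remaining() deliberately keeps one slot free so that full and empty
// states are distinguishable, so a FixedQueue<T, 8> holds at most
// 7 entries at once:
//
//   FixedQueue<int, 8> q;
//   q.push(1); q.push(2);
//   int* p = q.dequeue();   // *p == 1; slot stays valid until overwritten
//   bool e = q.empty();     // false: one entry remains
//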
template <class T, int SIZE>
struct Queue: public FixedQueue<T, SIZE> {
  typedef FixedQueue<T, SIZE> base_t;

  Queue() {
    reset();
  }

  void reset() {
    base_t::reset();
    foreach (i, SIZE) {
      (*this)[i].init(i);
    }
  }

  T* alloc() {
    T* p = base_t::alloc();
    if likely (p) p->validate();
    return p;
  }
};

template <class T, int size>
ostream& operator <<(ostream& os, const Queue<T, size>& queue) {
  os << "Queue<", size, ">: head ", queue.head, " to tail ", queue.tail, " (", queue.count, " entries):", endl;
  foreach_forward(queue, i) {
    const T& entry = queue[i];
    os << " ", entry, endl;
  }
  return os;
}
template <typename T, int size>
struct HistoryBuffer: public array<T, size> {
  int current;
  T prevoldest;

  void reset() {
    current = size-1;
    setzero(this->data);
  }

  HistoryBuffer() {
    reset();
  }

  //
  // Enqueue t at the tail of the queue, making the results
  // visible for possible dequeueing by an earlier pipeline
  // stage within the same cycle (i.e., forwarding is used).
  // If this is not desirable, use enqueuesync() instead.
  //
  void add(const T& t) {
    current = add_index_modulo(current, +1, size);
    prevoldest = this->data[current];
    this->data[current] = t;
  }

  /*
   * Undo last addition
   */
  void undo() {
    this->data[current] = prevoldest;
    current = add_index_modulo(current, -1, size);
  }

  /*
   * Index backwards in time: 0 = most recent addition
   */
  T& operator [](int index) {
    int idx = add_index_modulo(current, -index, size);
    //assert(inrange(idx, 0, size-1));
    return this->data[idx];
  }

  const T& operator [](int index) const {
    int idx = add_index_modulo(current, -index, size);
    //assert(inrange(idx, 0, size-1));
    return this->data[idx];
  }
};

template <class T, int size>
ostream& operator <<(ostream& os, HistoryBuffer<T, size>& history) {
  os << "HistoryBuffer[", size, "]: current = ", history.current, ", prevoldest = ", history.prevoldest, endl;
  for (int i = 0; i < size; i++) {
    os << " ", history[i], endl;
  }
  return os;
}
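
//
// Illustrative sketch (assumed usage, not in the original source):
// a HistoryBuffer indexes backwards in time, which suits structures
// like branch history where [0] is always the most recent outcome:
//
//   HistoryBuffer<byte, 64> bhist;
//   bhist.add(1);           // taken
//   bhist.add(0);           // not taken
//   byte last = bhist[0];   // 0: most recent addition
//   byte prev = bhist[1];   // 1: one step further back
//   bhist.undo();           // roll back the last add() after a misspeculation
//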
//
// Fully Associative Arrays
//

template <typename T> struct InvalidTag { static const T INVALID; };
template <> struct InvalidTag<W64> { static const W64 INVALID = 0xffffffffffffffffULL; };
template <> struct InvalidTag<W32> { static const W32 INVALID = 0xffffffff; };
template <> struct InvalidTag<W16> { static const W16 INVALID = 0xffff; };
template <> struct InvalidTag<W8> { static const W8 INVALID = 0xff; };
//
// The replacement policy is pseudo-LRU using a most recently used
// bit vector (mLRU), as described in the paper "Performance Evaluation
// of Cache Replacement Policies for the SPEC CPU2000 Benchmark Suite"
// by Al-Zoubi et al. Essentially we maintain one MRU bit per way and
// set the bit for a way whenever that way is accessed. The way to evict
// is the first way without its MRU bit set. If all MRU bits become
// set, they are all reset and we start over. Surprisingly, this
// simple method performs as well as, if not better than, true LRU
// or tree-based hot sector LRU.
//
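//
// Worked example (illustrative, not part of the original comment), for
// a 4-way set with evictmap bits shown as way3..way0:
//
//   access way 0 -> evictmap = 0001; access way 2 -> evictmap = 0101
//   lru() = first way with a clear MRU bit = way 1
//   once all four bits are set, the map resets and tracking starts over
//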
template <typename T, int ways>
struct FullyAssociativeTags {
  bitvec<ways> evictmap;
  T tags[ways];

  static const T INVALID = InvalidTag<T>::INVALID;

  FullyAssociativeTags() {
    reset();
  }

  void reset() {
    evictmap = 0;
    foreach (i, ways) {
      tags[i] = INVALID;
    }
  }

  void use(int way) {
    evictmap[way] = 1;
    // Performance is somewhat better with this off with higher associativity caches:
    // if (evictmap.allset()) evictmap = 0;
  }

  //
  // This is a clever way of doing branch-free matching
  // with conditional moves and addition. It relies on
  // having at most one matching entry in the array;
  // otherwise the algorithm breaks:
  //
  int match(T target) {
    int way = 0;
    foreach (i, ways) {
      way += (tags[i] == target) ? (i + 1) : 0;
    }
    return way - 1;
  }
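
  //
  // Worked example (illustrative, not in the original source): with
  // ways = 4 and the target in way 2, the loop sums 0 + 0 + 3 + 0 = 3,
  // so match() returns 3 - 1 = 2. With no match the sum stays 0 and the
  // result is -1. A duplicate tag would add both (i + 1) terms and yield
  // a meaningless index, hence the uniqueness requirement above.
  //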
  int probe(T target) {
    int way = match(target);
    if (way < 0) return -1;
    use(way);
    return way;
  }

  int lru() const {
    return (evictmap.allset()) ? 0 : (~evictmap).lsb();
  }

  int select(T target, T& oldtag) {
    int way = probe(target);
    if (way < 0) {
      way = lru();
      if (evictmap.allset()) evictmap = 0;
      oldtag = tags[way];
      tags[way] = target;
    }
    use(way);
    if (evictmap.allset()) {
      evictmap = 0;
      use(way);
    }
    return way;
  }

  int select(T target) {
    T dummy;
    return select(target, dummy);
  }

  void invalidate_way(int way) {
    tags[way] = INVALID;
    evictmap[way] = 0;
  }

  int invalidate(T target) {
    int way = probe(target);
    if (way < 0) return -1;
    invalidate_way(way);
    return way;
  }

  const T& operator [](int index) const { return tags[index]; }
  T& operator [](int index) { return tags[index]; }

  int operator ()(T target) { return probe(target); }

  stringbuf& printway(stringbuf& os, int i) const {
    os << " way ", intstring(i, -2), ": ";
    if (tags[i] != INVALID) {
      os << "tag 0x", hexstring(tags[i], sizeof(T)*8);
      if (evictmap[i]) os << " (MRU)";
    } else {
      os << "<invalid>";
    }
    return os;
  }

  stringbuf& print(stringbuf& os) const {
    foreach (i, ways) {
      printway(os, i);
      os << endl;
    }
    return os;
  }

  ostream& print(ostream& os) const {
    stringbuf sb;
    print(sb);
    os << sb;
    return os;
  }
};
template <typename T, int ways>
ostream& operator <<(ostream& os, const FullyAssociativeTags<T, ways>& tags) {
  return tags.print(os);
}

template <typename T, int ways>
stringbuf& operator <<(stringbuf& sb, const FullyAssociativeTags<T, ways>& tags) {
  return tags.print(sb);
}
//
// Associative array implemented using vectorized
// comparisons spread across multiple byte slices
// and executed in parallel.
//
// This implementation is roughly 2-4x as fast
// as the naive scalar code on SSE2 machines,
// especially for larger arrays.
//
// Very small arrays (less than 8 entries) should
// use the normal scalar FullyAssociativeTags
// for best performance. Both classes use the
// same principle for very fast one-hot matching.
//
// Limitations:
//
// - Every tag in the array must be unique,
//   except for the invalid tag (all 1s)
//
// - <size> can be from 1 to 128. Technically
//   up to 254, however element 255 cannot
//   be used. Matching is done in groups
//   of 16 elements in parallel.
//
// - <width> in bits can be from 1 to 64
//
template <int size, int width, int padsize = 0>
struct FullyAssociativeTagsNbitOneHot {
  typedef vec16b vec_t;
  typedef W64 base_t;

  static const int slices = (width + 7) / 8;
  static const int chunkcount = (size+15) / 16;
  static const int padchunkcount = (padsize+15) / 16;

  vec16b tags[slices][chunkcount + padchunkcount] alignto(16);
  base_t tagsmirror[size]; // for fast scalar access
  bitvec<size> valid;
  bitvec<size> evictmap;

  FullyAssociativeTagsNbitOneHot() {
    reset();
  }

  void reset() {
    valid = 0;
    evictmap = 0;
    memset(tags, 0xff, sizeof(tags));
    memset(tagsmirror, 0xff, sizeof(tagsmirror));
  }

  int match(const vec16b* targetslices) const {
    vec16b sum = x86_sse_zerob();
    foreach (i, chunkcount) {
      vec16b eq = *((vec16b*)&index_bytes_plus1_vec16b[i]);
      foreach (j, slices) {
        eq = x86_sse_pandb(x86_sse_pcmpeqb(tags[j][i], targetslices[j]), eq);
      }
      sum = x86_sse_psadbw(sum, eq);
    }
    int idx = (x86_sse_pextrw<0>(sum) + x86_sse_pextrw<4>(sum));
    return idx-1;
  }

  static void prep(vec16b* targetslices, base_t tag) {
    foreach (i, slices) {
      targetslices[i] = x86_sse_dupb((byte)tag);
      tag >>= 8;
    }
  }

  int match(base_t tag) const {
    vec16b targetslices[16];
    prep(targetslices, tag);
    return match(targetslices);
  }

  int search(base_t tag) const {
    return match(tag);
  }

  int operator()(base_t tag) const {
    return search(tag);
  }

  void update(int index, base_t tag) {
    // Spread it across all the byte slices
    base_t t = tag;
    foreach (i, slices) {
      *(((byte*)(&tags[i])) + index) = (byte)t;
      t >>= 8;
    }
    tagsmirror[index] = tag;
    valid[index] = 1;
    evictmap[index] = 1;
  }
  class ref {
    friend class FullyAssociativeTagsNbitOneHot;
    FullyAssociativeTagsNbitOneHot<size, width, padsize>& tags;
    int index;
    ref();
  public:
    inline ref(FullyAssociativeTagsNbitOneHot& tags_, int index_): tags(tags_), index(index_) { }
    inline ~ref() { }

    inline ref& operator =(base_t tag) {
      tags.update(index, tag);
      return *this;
    }

    inline ref& operator =(const ref& other) {
      // Read through the other ref's scalar mirror (the original
      // accessed other.tagsmirror, which is not a member of ref):
      tags.update(index, other.tags.tagsmirror[other.index]);
      return *this;
    }
  };

  friend class ref;

  ref operator [](int index) { return ref(*this, index); }
  base_t operator [](int index) const { return tagsmirror[index]; }
  bool isvalid(int index) {
    return valid[index];
  }

  int insertslot(int idx, base_t tag) {
    valid[idx] = 1;
    (*this)[idx] = tag;
    return idx;
  }

  int insert(base_t tag) {
    if (valid.allset()) return -1;
    int idx = (~valid).lsb();
    return insertslot(idx, tag);
  }
  void invalidateslot(int index) {
    // Write the invalid marker first: the assignment goes through
    // ref::operator=, whose update() sets the valid and evictmap bits,
    // so they must be cleared afterwards (the original cleared valid
    // before the assignment, which re-validated the slot):
    (*this)[index] = 0xffffffffffffffffULL; // invalid marker
    valid[index] = 0;
    evictmap[index] = 0;
  }

  void validateslot(int index) {
    valid[index] = 1;
  }

  int invalidate(base_t target) {
    int index = match(target);
    if (index < 0) return 0;
    invalidateslot(index);
    return 1;
  }
  bitvec<size> masked_match(base_t targettag, base_t tagmask) {
    bitvec<size> m;
    foreach (i, size) {
      base_t tag = tagsmirror[i];
      m[i] = ((tag & tagmask) == targettag);
    }
    return m;
  }

  void masked_invalidate(const bitvec<size>& slotmask) {
    foreach (i, size) {
      if unlikely (slotmask[i]) invalidateslot(i);
    }
  }

  void use(int way) {
    evictmap[way] = 1;
    if (evictmap.allset()) {
      evictmap = 0;
      evictmap[way] = 1;
    }
  }

  int probe(base_t target) {
    int way = match(target);
    if (way < 0) return way;
    use(way);
    return way;
  }

  int lru() const {
    return (evictmap.allset()) ? 0 : (~evictmap).lsb();
  }

  int select(base_t target, base_t& oldtag) {
    int way = probe(target);
    if (way < 0) {
      way = lru();
      if (evictmap.allset()) evictmap = 0;
      oldtag = tagsmirror[way];
      update(way, target);
    }
    use(way);
    return way;
  }

  int select(base_t target) {
    base_t dummy;
    return select(target, dummy);
  }

  ostream& printid(ostream& os, int slot) const {
    base_t tag = (*this)[slot];
    os << intstring(slot, 3), ": ";
    os << hexstring(tag, 64);
    os << " ";
    foreach (i, slices) {
      const byte b = *(((byte*)(&tags[i])) + slot);
      os << " ", hexstring(b, 8);
    }
    if (!valid[slot]) os << " <invalid>";
    return os;
  }

  ostream& print(ostream& os) const {
    foreach (i, size) {
      printid(os, i);
      os << endl;
    }
    return os;
  }
};
template <int size, int width, int padsize>
ostream& operator <<(ostream& os, const FullyAssociativeTagsNbitOneHot<size, width, padsize>& tags) {
  return tags.print(os);
}
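
//
// Illustrative sketch (assumed usage, not in the original source):
// a 32-entry CAM of 40-bit tags, matched 16 entries at a time via
// SSE2 byte-slice compares:
//
//   FullyAssociativeTagsNbitOneHot<32, 40> cam;
//   int slot = cam.insert(0x123456789ULL);   // -1 if the array is full
//   int hit = cam.search(0x123456789ULL);    // == slot on a hit, -1 on a miss
//   cam.invalidate(0x123456789ULL);          // returns 1 if found and cleared
//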
template <typename T, typename V>
struct NullAssociativeArrayStatisticsCollector {
  static void inserted(V& elem, T newtag, int way) { }
  static void replaced(V& elem, T oldtag, T newtag, int way) { }
  static void probed(V& elem, T tag, int way, bool hit) { }
  static void overflow(T tag) { }
  static void locked(V& slot, T tag, int way) { }
  static void unlocked(V& slot, T tag, int way) { }
  static void invalidated(V& elem, T oldtag, int way) { }
};
template <typename T, typename V, int ways, typename stats = NullAssociativeArrayStatisticsCollector<T, V> >
struct FullyAssociativeArray {
  FullyAssociativeTags<T, ways> tags;
  V data[ways];

  FullyAssociativeArray() {
    reset();
  }

  void reset() {
    tags.reset();
    foreach (i, ways) { data[i].reset(); }
  }

  V* probe(T tag) {
    int way = tags.probe(tag);
    stats::probed((way < 0) ? data[0] : data[way], tag, way, (way >= 0));
    return (way < 0) ? NULL : &data[way];
  }

  V* match(T tag) {
    int way = tags.match(tag);
    return (way < 0) ? NULL : &data[way];
  }

  V* select(T tag, T& oldtag) {
    int way = tags.select(tag, oldtag);
    V& slot = data[way];
    if ((way >= 0) & (tag == oldtag)) {
      stats::probed(slot, tag, way, 1);
    } else {
      if (oldtag == tags.INVALID)
        stats::inserted(slot, tag, way);
      else stats::replaced(slot, oldtag, tag, way);
    }
    return &slot;
  }

  V* select(T tag) {
    T dummy;
    return select(tag, dummy);
  }

  int wayof(const V* line) const {
    int way = (line - (const V*)&data);
#if 0
    assert(inrange(way, 0, ways-1));
#endif
    return way;
  }

  T tagof(V* line) {
    int way = wayof(line);
    return tags.tags[way];
  }

  void invalidate_way(int way) {
    stats::invalidated(data[way], tags[way], way);
    tags.invalidate_way(way);
    data[way].reset();
  }

  void invalidate_line(V* line) {
    invalidate_way(wayof(line));
  }

  int invalidate(T tag) {
    int way = tags.probe(tag);
    if (way < 0) return -1;
    invalidate_way(way);
    return way;
  }

  V& operator [](int way) { return data[way]; }
  V* operator ()(T tag) { return select(tag); }

  ostream& print(ostream& os) const {
    foreach (i, ways) {
      stringbuf sb;
      tags.printway(sb, i);
      os << padstring(sb, -40), " -> ";
      data[i].print(os, tags.tags[i]);
      os << endl;
    }
    return os;
  }
};
template <typename T, typename V, int ways>
ostream& operator <<(ostream& os, const FullyAssociativeArray<T, V, ways>& assoc) {
  return assoc.print(os);
}
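
//
// Illustrative sketch (assumed usage, not in the original source):
// pairing the tag array with payload lines. Line is a hypothetical
// payload type; V must provide reset(), and select() evicts the
// pseudo-LRU way on a miss:
//
//   struct Line { W64 value; void reset() { value = 0; } };
//   FullyAssociativeArray<W64, Line, 8> buf;
//   Line* line = buf.select(0x1000);   // hit, or insert/replace on miss
//   Line* hit = buf.probe(0x1000);     // NULL on miss; never allocates
//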
template <typename T, typename V, int setcount, int waycount, int linesize, typename stats = NullAssociativeArrayStatisticsCollector<T, V> >
struct AssociativeArray {
  typedef FullyAssociativeArray<T, V, waycount, stats> Set;
  Set sets[setcount];

  AssociativeArray() {
    reset();
  }

  void reset() {
    foreach (set, setcount) {
      sets[set].reset();
    }
  }

  static int setof(T addr) {
    return bits(addr, log2(linesize), log2(setcount));
  }

  static T tagof(T addr) {
    return floor(addr, linesize);
  }

  V* probe(T addr) {
    return sets[setof(addr)].probe(tagof(addr));
  }

  V* match(T addr) {
    return sets[setof(addr)].match(tagof(addr));
  }

  V* select(T addr, T& oldaddr) {
    return sets[setof(addr)].select(tagof(addr), oldaddr);
  }

  V* select(T addr) {
    T dummy;
    return sets[setof(addr)].select(tagof(addr), dummy);
  }

  int invalidate(T addr) {
    return sets[setof(addr)].invalidate(tagof(addr));
  }

  ostream& print(ostream& os) const {
    os << "AssociativeArray<", setcount, " sets, ", waycount, " ways, ", linesize, "-byte lines>:", endl;
    foreach (set, setcount) {
      os << " Set ", set, ":", endl;
      os << sets[set];
    }
    return os;
  }
};
template <typename T, typename V, int size, int ways, int linesize>
ostream& operator <<(ostream& os, const AssociativeArray<T, V, size, ways, linesize>& aa) {
  return aa.print(os);
}
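
//
// Worked example (illustrative, not in the original source): with
// setcount = 64 and linesize = 64, address 0x12345 decomposes as:
//
//   tagof(0x12345) = floor(0x12345, 64) = 0x12340   (line-aligned address)
//   setof(0x12345) = bits(addr, 6, 6)   = 0x0D      (address bits [11:6])
//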
//
// Lockable version of associative arrays:
//
template <typename T, int ways>
struct LockableFullyAssociativeTags {
  bitvec<ways> evictmap;
  bitvec<ways> unlockedmap;
  T tags[ways];

  static const T INVALID = InvalidTag<T>::INVALID;

  LockableFullyAssociativeTags() {
    reset();
  }

  void reset() {
    evictmap = 0;
    unlockedmap.setall();
    foreach (i, ways) {
      tags[i] = INVALID;
    }
  }

  void use(int way) {
    evictmap[way] = 1;
    // Performance is somewhat better with this off with higher associativity caches:
    // if (evictmap.allset()) evictmap = 0;
  }

  //
  // This is a clever way of doing branch-free matching
  // with conditional moves and addition. It relies on
  // having at most one matching entry in the array;
  // otherwise the algorithm breaks:
  //
  int match(T target) {
    int way = 0;
    foreach (i, ways) {
      way += (tags[i] == target) ? (i + 1) : 0;
    }
    return way - 1;
  }

  int probe(T target) {
    int way = match(target);
    if (way < 0) return -1;
    use(way);
    return way;
  }

  int lru() const {
    if (!unlockedmap) return -1;
    bitvec<ways> w = (~evictmap) & unlockedmap;
    return (*w) ? w.lsb() : 0;
  }

  int select(T target, T& oldtag) {
    int way = probe(target);
    if (way < 0) {
      way = lru();
      if (way < 0) return -1;
      if (evictmap.allset()) evictmap = 0;
      oldtag = tags[way];
      tags[way] = target;
    }
    use(way);
    return way;
  }

  int select(T target) {
    T dummy;
    return select(target, dummy);
  }

  int select_and_lock(T tag, bool& firstlock, T& oldtag) {
    int way = select(tag, oldtag);
    if (way < 0) return way;
    firstlock = unlockedmap[way];
    lock(way);
    return way;
  }

  int select_and_lock(T tag, bool& firstlock) {
    T dummy;
    return select_and_lock(tag, firstlock, dummy);
  }

  int select_and_lock(T target) { bool dummy; return select_and_lock(target, dummy); }

  void invalidate_way(int way) {
    tags[way] = INVALID;
    evictmap[way] = 0;
    unlockedmap[way] = 1;
  }
  int invalidate(T target) {
    int way = probe(target);
    if (way < 0) return -1;
    invalidate_way(way);
    return way; // the original fell off the end here without returning a value
  }
  bool islocked(int way) const { return !unlockedmap[way]; }
  void lock(int way) { unlockedmap[way] = 0; }
  void unlock(int way) { unlockedmap[way] = 1; }

  const T& operator [](int index) const { return tags[index]; }
  T& operator [](int index) { return tags[index]; }

  int operator ()(T target) { return probe(target); }

  stringbuf& printway(stringbuf& os, int i) const {
    os << " way ", intstring(i, -2), ": ";
    if (tags[i] != INVALID) {
      os << "tag 0x", hexstring(tags[i], sizeof(T)*8);
      if (evictmap[i]) os << " (MRU)";
      if (!unlockedmap[i]) os << " (locked)";
    } else {
      os << "<invalid>";
    }
    return os;
  }

  stringbuf& print(stringbuf& os) const {
    foreach (i, ways) {
      printway(os, i);
      os << endl;
    }
    return os;
  }

  ostream& print(ostream& os) const {
    stringbuf sb;
    print(sb);
    os << sb;
    return os;
  }
};

template <typename T, int ways>
ostream& operator <<(ostream& os, const LockableFullyAssociativeTags<T, ways>& tags) {
  return tags.print(os);
}

template <typename T, int ways>
stringbuf& operator <<(stringbuf& sb, const LockableFullyAssociativeTags<T, ways>& tags) {
  return tags.print(sb);
}
template <typename T, typename V, int ways, typename stats = NullAssociativeArrayStatisticsCollector<T, V> >
struct LockableFullyAssociativeArray {
  LockableFullyAssociativeTags<T, ways> tags;
  V data[ways];

  LockableFullyAssociativeArray() {
    reset();
  }

  void reset() {
    tags.reset();
    foreach (i, ways) { data[i].reset(); }
  }

  V* probe(T tag) {
    int way = tags.probe(tag);
    stats::probed((way < 0) ? data[0] : data[way], tag, way, (way >= 0));
    return (way < 0) ? NULL : &data[way];
  }

  V* select(T tag, T& oldtag) {
    int way = tags.select(tag, oldtag);
    if (way < 0) {
      stats::overflow(tag);
      return NULL;
    }
    V& slot = data[way];
    if ((way >= 0) & (tag == oldtag)) {
      stats::probed(slot, tag, way, 1);
    } else {
      if (oldtag == tags.INVALID)
        stats::inserted(slot, tag, way);
      else stats::replaced(slot, oldtag, tag, way);
    }
    return &slot;
  }

  V* select(T tag) {
    T dummy;
    return select(tag, dummy);
  }

  V* select_and_lock(T tag, bool& firstlock, T& oldtag) {
    int way = tags.select_and_lock(tag, firstlock, oldtag);
    if (way < 0) {
      stats::overflow(tag);
      return NULL;
    }
    V& slot = data[way];
    if (tag == oldtag) {
      stats::probed(slot, tag, way, 1);
    } else {
      if (oldtag == tags.INVALID)
        stats::inserted(slot, tag, way);
      else stats::replaced(slot, oldtag, tag, way);
      stats::locked(slot, tag, way);
    }
    return &slot;
  }

  V* select_and_lock(T tag, bool& firstlock) {
    T dummy;
    return select_and_lock(tag, firstlock, dummy);
  }

  V* select_and_lock(T tag) { bool dummy; return select_and_lock(tag, dummy); }

  int wayof(const V* line) const {
    int way = (line - (const V*)&data);
#if 0
    assert(inrange(way, 0, ways-1));
#endif
    return way;
  }

  T tagof(V* line) {
    int way = wayof(line);
    return tags.tags[way];
  }

  void invalidate_way(int way) {
    unlock_way(way);
    stats::invalidated(data[way], tags[way], way);
    tags.invalidate_way(way);
    data[way].reset();
  }

  void invalidate_line(V* line) {
    invalidate_way(wayof(line));
  }

  int invalidate(T tag) {
    int way = tags.probe(tag);
    if (way < 0) return -1;
    invalidate_way(way);
    return way;
  }
  void unlock_way(int way) {
    stats::unlocked(data[way], tags[way], way);
    tags.unlock(way);
  }

  void unlock_line(V* line) {
    unlock_way(wayof(line));
  }

  int unlock(T tag) {
    int way = tags.probe(tag);
    if (way < 0) return -1; // the original had a bare "return;" in this int function
    unlock_way(way); // unlock_way() already notifies stats::unlocked
    return way;
  }
  V& operator [](int way) { return data[way]; }
  V* operator ()(T tag) { return select(tag); }

  ostream& print(ostream& os) const {
    foreach (i, ways) {
      stringbuf sb;
      tags.printway(sb, i);
      os << padstring(sb, -40), " -> ";
      data[i].print(os, tags.tags[i]);
      os << endl;
    }
    return os;
  }
};

template <typename T, typename V, int ways>
ostream& operator <<(ostream& os, const LockableFullyAssociativeArray<T, V, ways>& assoc) {
  return assoc.print(os);
}
template <typename T, typename V, int setcount, int waycount, int linesize, typename stats = NullAssociativeArrayStatisticsCollector<T, V> >
struct LockableAssociativeArray {
  typedef LockableFullyAssociativeArray<T, V, waycount, stats> Set;
  Set sets[setcount];

  LockableAssociativeArray() {
    reset();
  }

  void reset() {
    foreach (set, setcount) {
      sets[set].reset();
    }
  }

  static int setof(T addr) {
    return bits(addr, log2(linesize), log2(setcount));
  }

  static T tagof(T addr) {
    return floor(addr, linesize);
  }

  V* probe(T addr) {
    return sets[setof(addr)].probe(tagof(addr));
  }

  V* select(T addr, T& oldaddr) {
    return sets[setof(addr)].select(tagof(addr), oldaddr);
  }

  V* select(T addr) {
    T dummy;
    return select(addr, dummy);
  }

  void invalidate(T addr) {
    sets[setof(addr)].invalidate(tagof(addr));
  }

  V* select_and_lock(T addr, bool& firstlock, T& oldtag) {
    V* line = sets[setof(addr)].select_and_lock(tagof(addr), firstlock, oldtag);
    return line;
  }
  V* select_and_lock(T addr, bool& firstlock) {
    T dummy; // the original declared this as W64, which only typechecks when T is W64
    return select_and_lock(addr, firstlock, dummy);
  }
  V* select_and_lock(T addr) { bool dummy; return select_and_lock(addr, dummy); }

  ostream& print(ostream& os) const {
    os << "LockableAssociativeArray<", setcount, " sets, ", waycount, " ways, ", linesize, "-byte lines>:", endl;
    foreach (set, setcount) {
      os << " Set ", set, ":", endl;
      os << sets[set];
    }
    return os;
  }
};

template <typename T, typename V, int size, int ways, int linesize>
ostream& operator <<(ostream& os, const LockableAssociativeArray<T, V, size, ways, linesize>& aa) {
  return aa.print(os);
}
template <typename T, int setcount, int linesize>
struct DefaultCacheIndexingFunction {
  static inline Waddr setof(T address) { return bits(address, log2(linesize), log2(setcount)); }
};

template <typename T, int setcount, int linesize>
struct XORCacheIndexingFunction {
  static inline Waddr setof(T address) {
    address >>= log2(linesize);
    const int tagbits = (sizeof(Waddr) * 8) - log2(linesize);
    address = lowbits(address, tagbits);
    return foldbits<log2(setcount)>(address);
  }
};
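
//
// Worked example (illustrative, not in the original source): XOR
// indexing folds the entire line number down to log2(setcount) bits,
// so higher address bits perturb the set index and strided access
// patterns spread across sets rather than aliasing into one set:
//
//   linesize = 64, setcount = 64: line = addr >> 6, then
//   set = line bits [5:0] ^ [11:6] ^ [17:12] ^ ...
//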
template <typename T, int setcount, int linesize>
struct CRCCacheIndexingFunction {
  static inline Waddr setof(T address) {
    address >>= log2(linesize);
    CRC32 crc;
    crc << address;
    W32 v = crc;
    return foldbits<log2(setcount)>(v);
  }
};
template <typename T, typename V, int setcount, int waycount, int linesize, typename indexfunc = DefaultCacheIndexingFunction<T, setcount, linesize>, typename stats = NullAssociativeArrayStatisticsCollector<T, V> >
struct LockableCommitRollbackAssociativeArray {
  typedef LockableFullyAssociativeArray<T, V, waycount, stats> Set;
  Set sets[setcount];

  struct ClearList {
    W16 set;
    W16 way;
  };

  //
  // Technically (setcount * waycount) will cover everything,
  // but this is not true if we allow lines to be explicitly
  // invalidated between commits. In this case, a potentially
  // unlimited buffer would be required as noted below.
  //
  // Therefore, we choose a small size, above which it becomes
  // more efficient to just invalidate everything without
  // traversing a clear list.
  //
  ClearList clearlist[64];
  int cleartail;
  bool clearlist_exceeded;

  LockableCommitRollbackAssociativeArray() {
    reset();
  }

  void reset() {
    foreach (set, setcount) {
      sets[set].reset();
    }
    cleartail = 0;
    clearlist_exceeded = 0;
  }

  static int setof(T addr) {
    return indexfunc::setof(addr);
  }

  static T tagof(T addr) {
    return floor(addr, linesize);
  }

  V* probe(T addr) {
    return sets[setof(addr)].probe(tagof(addr));
  }

  V* select(T addr, T& oldaddr) {
    return sets[setof(addr)].select(tagof(addr), oldaddr);
  }

  V* select(T addr) {
    T dummy;
    return select(addr, dummy);
  }

  void invalidate(T addr) {
    sets[setof(addr)].invalidate(tagof(addr));
  }

  V* select_and_lock(T addr, bool& firstlock, T& oldtag) {
    V* line = sets[setof(addr)].select_and_lock(tagof(addr), firstlock, oldtag);
    if unlikely (!line) return NULL;
    if likely (firstlock) {
      int set = setof(addr);
      int way = sets[set].wayof(line);
      if unlikely (cleartail >= lengthof(clearlist)) {
        //
        // Too many lines are locked to keep track of: this can
        // happen if some lines are intentionally invalidated
        // before the final commit or rollback; these invalidates
        // do not remove the corresponding slot from the clearlist,
        // so the list may still overflow. In this case, just bulk
        // process every set and every way.
        //
        clearlist_exceeded = 1;
      } else {
        ClearList& c = clearlist[cleartail++];
        c.set = set;
        c.way = way;
      }
    }
    return line;
  }
  V* select_and_lock(T addr, bool& firstlock) {
    T dummy; // the original declared this as W64, which only typechecks when T is W64
    return select_and_lock(addr, firstlock, dummy);
  }
  V* select_and_lock(T addr) { bool dummy; return select_and_lock(addr, dummy); }

  void unlock_all_and_invalidate() {
    if unlikely (clearlist_exceeded) {
      foreach (setid, setcount) {
        Set& set = sets[setid];
        foreach (wayid, waycount) set.invalidate_way(wayid);
      }
    } else {
      foreach (i, cleartail) {
        ClearList& c = clearlist[i];
#if 0
        assert(c.set < setcount);
        assert(c.way < waycount);
#endif
        Set& set = sets[c.set];
        V& line = set[c.way];
        set.invalidate_line(&line);
      }
    }
    cleartail = 0;
    clearlist_exceeded = 0;
#if 0
    foreach (s, setcount) {
      Set& set = sets[s];
      foreach (way, waycount) {
        V& line = set[way];
        T tag = set.tagof(&line);
        if ((tag != set.tags.INVALID)) {
          assert(false);
        }
      }
    }
#endif
  }

  void unlock_all() {
    if unlikely (clearlist_exceeded) {
      foreach (setid, setcount) {
        Set& set = sets[setid];
        foreach (wayid, waycount) set.unlock_way(wayid);
      }
    } else {
      foreach (i, cleartail) {
        ClearList& c = clearlist[i];
#if 0
        assert(c.set < setcount);
        assert(c.way < waycount);
#endif
        Set& set = sets[c.set];
        V& line = set[c.way];
        set.unlock_line(&line);
      }
    }
    cleartail = 0;
    clearlist_exceeded = 0;
  }
  ostream& print(ostream& os) const {
    os << "LockableCommitRollbackAssociativeArray<", setcount, " sets, ", waycount, " ways, ", linesize, "-byte lines>:", endl;
    foreach (set, setcount) {
      os << " Set ", set, ":", endl;
      os << sets[set];
    }
    return os;
  }
};
template <typename T, typename V, int size, int ways, int linesize>
ostream& operator <<(ostream& os, const LockableCommitRollbackAssociativeArray<T, V, size, ways, linesize>& aa) {
  return aa.print(os);
}
//
// Lockable cache arrays supporting commit/rollback
//
// This structure implements the dirty-and-locked scheme to prevent speculative
// data from propagating to lower levels of the cache hierarchy until it can be
// committed.
//
// Any stores into the cache (signalled by select_and_lock()) back up the old
// cache line and add this to an array for later rollback purposes.
//
// At commit(), all locked lines are unlocked and the backed up cache lines are
// simply discarded, leaving them free to be replaced or written back.
//
// At rollback() all locked lines are invalidated in both this cache and any
// higher levels (via the invalidate_upwards() callback), thereby forcing
// clean copies to be refetched as needed after the rollback.
//
template <typename T, typename V, int setcount, int waycount, int linesize, int maxdirty, typename stats = NullAssociativeArrayStatisticsCollector<T, V> >
struct CommitRollbackCache: public LockableCommitRollbackAssociativeArray<T, V, setcount, waycount, linesize, DefaultCacheIndexingFunction<T, setcount, linesize>, stats> {
  // Note: stats belongs in the seventh template slot of the base class;
  // the original passed it sixth, where it landed in the indexfunc parameter.
  typedef LockableCommitRollbackAssociativeArray<T, V, setcount, waycount, linesize, DefaultCacheIndexingFunction<T, setcount, linesize>, stats> array_t;
  struct BackupCacheLine {
    W64* addr;
    W64 data[linesize / sizeof(W64)];
  };

  BackupCacheLine stores[maxdirty];
  BackupCacheLine* storetail;

  CommitRollbackCache() {
    reset();
  }

  void reset() {
    array_t::reset();
    storetail = stores;
  }

  //
  // Invalidate lines in higher level caches if needed
  //
  void invalidate_upwards(T addr);

  void invalidate(T addr) {
    array_t::invalidate(addr);
    invalidate_upwards(addr);
  }

  V* select_and_lock(T addr, T& oldaddr) {
    addr = floor(addr, linesize);
    bool firstlock;
    V* line = array_t::select_and_lock(addr, firstlock, oldaddr);
    if (!line) return NULL;
    if (firstlock) {
      W64* linedata = (W64*)addr;
      storetail->addr = linedata;
      foreach (i, lengthof(storetail->data)) storetail->data[i] = linedata[i];
      storetail++;
    }
    return line;
  }

  V* select_and_lock(T addr) {
    T dummy;
    return select_and_lock(addr, dummy);
  }

  void commit() {
    array_t::unlock_all();
    storetail = stores;
  }

  void rollback() {
    array_t::unlock_all_and_invalidate();
    BackupCacheLine* cl = stores;
    while (cl < storetail) {
      W64* linedata = cl->addr;
      foreach (i, lengthof(storetail->data)) linedata[i] = cl->data[i];
      invalidate_upwards((W64)cl->addr);
      cl++;
    }
    storetail = stores;
  }

  void complete() { }
};
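
//
// Illustrative sketch (assumed usage, not in the original source):
// a speculative store locks its line and backs up the old data; the
// checkpoint is then either committed or rolled back wholesale. Line
// is a hypothetical payload type with reset(), and the user must
// define invalidate_upwards(), which is only declared above:
//
//   CommitRollbackCache<W64, Line, 64, 4, 64, 16> dcache;
//   W64 evicted;
//   Line* line = dcache.select_and_lock(addr, evicted);
//   // ... perform speculative stores into the line ...
//   dcache.commit();        // keep new data, discard the backups
//   // or: dcache.rollback();  // restore backups, invalidate locked lines
//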
template <int size, int padsize = 0>
struct FullyAssociativeTags8bit {
  typedef vec16b vec_t;
  typedef byte base_t;

  static const int chunkcount = (size+15) / 16;
  static const int padchunkcount = (padsize+15) / 16;

  vec_t tags[chunkcount + padchunkcount] alignto(16);
  bitvec<size> valid;

  W64 getvalid() { return valid.integer(); }

  FullyAssociativeTags8bit() {
    reset();
  }

  base_t operator [](int i) const {
    return ((base_t*)&tags)[i];
  }

  base_t& operator [](int i) {
    return ((base_t*)&tags)[i];
  }

  bool isvalid(int index) {
    return valid[index];
  }

  void reset() {
    valid = 0;
    W64* p = (W64*)&tags;
    foreach (i, ((chunkcount + padchunkcount)*16)/8) p[i] = 0xffffffffffffffffLL;
  }

  static const vec_t prep(base_t tag) {
    return x86_sse_dupb(tag);
  }

  int insertslot(int idx, base_t tag) {
    valid[idx] = 1;
    (*this)[idx] = tag;
    return idx;
  }

  int insert(base_t tag) {
    if (valid.allset()) return -1;
    int idx = (~valid).lsb();
    return insertslot(idx, tag);
  }

  bitvec<size> match(const vec_t target) const {
    bitvec<size> m = 0;
    foreach (i, chunkcount) {
      m = m.accum(i*16, 16, x86_sse_pmovmskb(x86_sse_pcmpeqb(target, tags[i])));
    }
    return m & valid;
  }

  bitvec<size> match(base_t target) const {
    return match(prep(target));
  }

  bitvec<size> matchany(const vec_t target) const {
    bitvec<size> m = 0;
    vec_t zero = prep(0);
    foreach (i, chunkcount) {
      m = m.accum(i*16, 16, x86_sse_pmovmskb(x86_sse_pcmpeqb(x86_sse_pandb(tags[i], target), zero)));
    }
    return (~m) & valid;
  }

  bitvec<size> matchany(base_t target) const {
    return matchany(prep(target));
  }

  int search(const vec_t target) const {
    bitvec<size> bitmap = match(target);
    int idx = bitmap.lsb();
    if (!bitmap) idx = -1;
    return idx;
  }
  int extract(const vec_t target) {
    int idx = search(target);
    if (idx >= 0) valid[idx] = 0;
    return idx;
  }

  int search(base_t tag) const {
    return search(prep(tag));
  }

  // Returns the extracted slot index (the original declared a
  // bitvec<size> return type but forwards to the int overload):
  int extract(base_t tag) {
    return extract(prep(tag));
  }
  void invalidateslot(int index) {
    valid[index] = 0;
  }

  const bitvec<size>& invalidatemask(const bitvec<size>& mask) {
    valid &= ~mask;
    return mask;
  }

  bitvec<size> invalidate(const vec_t target) {
    return invalidatemask(match(target));
  }

  bitvec<size> invalidate(base_t target) {
    return invalidate(prep(target));
  }

  void collapse(int index) {
    base_t* tagbase = (base_t*)&tags;
    base_t* base = tagbase + index;
    vec_t* dp = (vec_t*)base;
    vec_t* sp = (vec_t*)(base + sizeof(base_t));
    foreach (i, chunkcount) {
      x86_sse_stvbu(dp++, x86_sse_ldvbu(sp++));
    }
    valid = valid.remove(index);
  }

  void decrement(base_t amount = 1) {
    foreach (i, chunkcount) { tags[i] = x86_sse_psubusb(tags[i], prep(amount)); }
  }

  void increment(base_t amount = 1) {
    foreach (i, chunkcount) { tags[i] = x86_sse_paddusb(tags[i], prep(amount)); }
  }

  ostream& printid(ostream& os, int slot) const {
    int tag = (*this)[slot];
    if (valid[slot])
      os << intstring(tag, 3);
    else os << "???";
    return os;
  }

  ostream& print(ostream& os) const {
    foreach (i, size) {
      printid(os, i);
      os << " ";
    }
    return os;
  }
};
template <int size, int padsize>
ostream& operator <<(ostream& os, const FullyAssociativeTags8bit<size, padsize>& tags) {
  return tags.print(os);
}
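
//
// Illustrative sketch (assumed usage, not in the original source):
// the 8-bit tag array suits small counters or IDs; collapse()
// compacts the array after a removal, and decrement() ages every
// entry at once with one saturating subtract per 16-entry chunk:
//
//   FullyAssociativeTags8bit<16> ids;
//   ids.insert(7);
//   ids.insert(9);
//   int slot = ids.search(9);   // lowest valid matching slot, -1 on miss
//   ids.decrement();            // every tag drops by 1 (saturating at 0)
//   ids.collapse(0);            // remove slot 0 and shift the rest down
//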
template <int size, int padsize = 0>
struct FullyAssociativeTags16bit {
  typedef vec8w vec_t;
  typedef W16 base_t;

  static const int chunkcount = ((size*2)+15) / 16;
  static const int padchunkcount = ((padsize*2)+15) / 16;

  vec_t tags[chunkcount + padchunkcount] alignto(16);
  bitvec<size> valid;

  W64 getvalid() { return valid.integer(); }

  FullyAssociativeTags16bit() {
    reset();
  }

  base_t operator [](int i) const {
    return ((base_t*)&tags)[i];
  }

  base_t& operator [](int i) {
    return ((base_t*)&tags)[i];
  }

  bool isvalid(int index) {
    return valid[index];
  }

  void reset() {
    valid = 0;
    W64* p = (W64*)&tags;
    foreach (i, ((chunkcount + padchunkcount)*16)/8) p[i] = 0xffffffffffffffffLL;
  }

  static const vec_t prep(base_t tag) {
    return x86_sse_dupw(tag);
  }

  int insertslot(int idx, base_t tag) {
    valid[idx] = 1;
    (*this)[idx] = tag;
    return idx;
  }

  int insert(base_t tag) {
    if (valid.allset()) return -1;
    int idx = (~valid).lsb();
    return insertslot(idx, tag);
  }

  bitvec<size> match(const vec_t target) const {
    bitvec<size> m = 0;
    foreach (i, chunkcount) {
      m = m.accum(i*8, 8, x86_sse_pmovmskw(x86_sse_pcmpeqw(target, tags[i])));
    }
    return m & valid;
  }

  bitvec<size> match(base_t target) const {
    return match(prep(target));
  }

  bitvec<size> matchany(const vec_t target) const {
    bitvec<size> m = 0;
    vec_t zero = prep(0);
    foreach (i, chunkcount) {
      m = m.accum(i*8, 8, x86_sse_pmovmskw(x86_sse_pcmpeqw(x86_sse_pandw(tags[i], target), zero)));
    }
    return (~m) & valid;
  }

  bitvec<size> matchany(base_t target) const {
    return matchany(prep(target));
  }

  int search(const vec_t target) const {
    bitvec<size> bitmap = match(target);
    int idx = bitmap.lsb();
    if (!bitmap) idx = -1;
    return idx;
  }

  int extract(const vec_t target) {
    int idx = search(target);
    if (idx >= 0) valid[idx] = 0;
    return idx;
  }

  int search(base_t tag) const {
    return search(prep(tag));
  }
  // Returns the extracted slot index (the original declared a
  // bitvec<size> return type but forwards to the int overload):
  int extract(base_t tag) {
    return extract(prep(tag));
  }
  void invalidateslot(int index) {
    valid[index] = 0;
  }

  const bitvec<size>& invalidatemask(const bitvec<size>& mask) {
    valid &= ~mask;
    return mask;
  }

  bitvec<size> invalidate(const vec_t target) {
    return invalidatemask(match(target));
  }

  bitvec<size> invalidate(base_t target) {
    return invalidate(prep(target));
  }

  void collapse(int index) {
    base_t* tagbase = (base_t*)&tags;
    base_t* base = tagbase + index;
    vec_t* dp = (vec_t*)base;
    vec_t* sp = (vec_t*)(base + 1);
    foreach (i, chunkcount) {
      x86_sse_stvwu(dp++, x86_sse_ldvwu(sp++));
    }
    valid = valid.remove(index);
  }

  void decrement(base_t amount = 1) {
    foreach (i, chunkcount) { tags[i] = x86_sse_psubusw(tags[i], prep(amount)); }
  }

  void increment(base_t amount = 1) {
    foreach (i, chunkcount) { tags[i] = x86_sse_paddusw(tags[i], prep(amount)); }
  }

  ostream& printid(ostream& os, int slot) const {
    int tag = (*this)[slot];
    if (valid[slot])
      os << intstring(tag, 3);
    else os << "???";
    return os;
  }

  ostream& print(ostream& os) const {
    foreach (i, size) {
      printid(os, i);
      os << " ";
    }
    return os;
  }
};

template <int size, int padsize>
ostream& operator <<(ostream& os, const FullyAssociativeTags16bit<size, padsize>& tags) {
  return tags.print(os);
}

#endif // _LOGIC_H_