
/rpython/jit/metainterp/optimizeopt/vector.py

https://bitbucket.org/pypy/pypy/
  1. """
  2. This is the core of the vec. optimization. It combines dependency.py and schedule.py
  3. to rewrite a loop in vectorized form.
  4. See the rpython doc for more high level details.
  5. """
  6. import py
  7. import time
  8. from rpython.jit.metainterp.jitexc import NotAVectorizeableLoop, NotAProfitableLoop
  9. from rpython.jit.metainterp.compile import (CompileLoopVersionDescr, ResumeDescr)
  10. from rpython.jit.metainterp.history import (INT, FLOAT, VECTOR, ConstInt, ConstFloat,
  11. TargetToken, JitCellToken, AbstractFailDescr)
  12. from rpython.jit.metainterp.optimizeopt.optimizer import Optimizer, Optimization
  13. from rpython.jit.metainterp.optimizeopt.renamer import Renamer
  14. from rpython.jit.metainterp.optimizeopt.dependency import (DependencyGraph,
  15. MemoryRef, Node, IndexVar)
  16. from rpython.jit.metainterp.optimizeopt.version import LoopVersionInfo
  17. from rpython.jit.metainterp.optimizeopt.schedule import (VecScheduleState,
  18. SchedulerState, Scheduler, Pack, Pair, AccumPack, forwarded_vecinfo)
  19. from rpython.jit.metainterp.optimizeopt.guard import GuardStrengthenOpt
  20. from rpython.jit.metainterp.resoperation import (rop, ResOperation, GuardResOp,
  21. OpHelpers, VecOperation, VectorizationInfo)
  22. from rpython.rlib import listsort
  23. from rpython.rlib.objectmodel import we_are_translated
  24. from rpython.rlib.debug import debug_print, debug_start, debug_stop
  25. from rpython.rlib.jit import Counters
  26. from rpython.rtyper.lltypesystem import lltype, rffi
  27. from rpython.jit.backend.llsupport.symbolic import (WORD as INT_WORD,
  28. SIZEOF_FLOAT as FLOAT_WORD)

def copy_resop(op):
    newop = op.copy()
    fwd = op.get_forwarded()
    if fwd is not None and isinstance(fwd, VectorizationInfo):
        newop.set_forwarded(fwd)
    return newop

class VectorLoop(object):
    def __init__(self, label, oplist, jump):
        self.label = label
        self.inputargs = label.getarglist_copy()
        self.prefix = []
        self.prefix_label = None
        assert self.label.getopnum() == rop.LABEL
        self.operations = oplist
        self.jump = jump
        assert self.jump.getopnum() == rop.JUMP

    def setup_vectorization(self):
        for op in self.operations:
            op.set_forwarded(VectorizationInfo(op))

    def teardown_vectorization(self):
        for op in self.operations:
            op.set_forwarded(None)

    def finaloplist(self, jitcell_token=None, reset_label_token=True, label=False):
        oplist = []
        if jitcell_token:
            if reset_label_token:
                token = TargetToken(jitcell_token)
                token.original_jitcell_token = jitcell_token
                jitcell_token.target_tokens.append(token)
                self.label.setdescr(token)
            else:
                token = self.jump.getdescr()
                assert isinstance(token, TargetToken)
            if self.prefix_label:
                token = TargetToken(jitcell_token)
                token.original_jitcell_token = jitcell_token
                jitcell_token.target_tokens.append(token)
                self.prefix_label.setdescr(token)
                self.jump.setdescr(token)
            if reset_label_token:
                self.jump.setdescr(token)
        if self.prefix_label:
            oplist = self.prefix + [self.prefix_label]
        elif self.prefix:
            oplist = self.prefix
        if label:
            oplist = [self.label] + oplist
        return oplist + self.operations + [self.jump]

    def clone(self):
        renamer = Renamer()
        label = copy_resop(self.label)
        prefix = []
        for op in self.prefix:
            newop = copy_resop(op)
            renamer.rename(newop)
            if not newop.returns_void():
                renamer.start_renaming(op, newop)
            prefix.append(newop)
        prefix_label = None
        if self.prefix_label:
            prefix_label = copy_resop(self.prefix_label)
            renamer.rename(prefix_label)
        oplist = []
        for op in self.operations:
            newop = copy_resop(op)
            renamer.rename(newop)
            if not newop.returns_void():
                renamer.start_renaming(op, newop)
            oplist.append(newop)
        jump = copy_resop(self.jump)
        renamer.rename(jump)
        loop = VectorLoop(copy_resop(self.label), oplist, jump)
        loop.prefix = prefix
        loop.prefix_label = prefix_label
        return loop

def optimize_vector(trace, metainterp_sd, jitdriver_sd, warmstate,
                    loop_info, loop_ops, jitcell_token=None):
    """ Enter the world of SIMD. Bails if it cannot transform the trace. """
    user_code = not jitdriver_sd.vec and warmstate.vec_all
    e = len(loop_ops)-1
    assert e > 0
    assert rop.is_final(loop_ops[e].getopnum())
    loop = VectorLoop(loop_info.label_op, loop_ops[:e], loop_ops[-1])
    if user_code and user_loop_bail_fast_path(loop, warmstate):
        return loop_info, loop_ops
    # the original loop (output of optimize_unroll)
    info = LoopVersionInfo(loop_info)
    version = info.snapshot(loop)
    loop.setup_vectorization()
    try:
        debug_start("vec-opt-loop")
        metainterp_sd.logger_noopt.log_loop([], loop.finaloplist(label=True), -2, None, None, "pre vectorize")
        metainterp_sd.profiler.count(Counters.OPT_VECTORIZE_TRY)
        #
        start = time.clock()
        opt = VectorizingOptimizer(metainterp_sd, jitdriver_sd, warmstate.vec_cost)
        index_vars = opt.run_optimization(info, loop)
        gso = GuardStrengthenOpt(index_vars)
        gso.propagate_all_forward(info, loop, user_code)
        end = time.clock()
        #
        metainterp_sd.profiler.count(Counters.OPT_VECTORIZED)
        metainterp_sd.logger_noopt.log_loop([], loop.finaloplist(label=True), -2, None, None, "post vectorize")
        nano = int((end-start)*10.0**9)
        debug_print("# vecopt factor: %d opcount: (%d -> %d) took %dns" % \
                (opt.unroll_count+1, len(version.loop.operations), len(loop.operations), nano))
        debug_stop("vec-opt-loop")
        #
        info.label_op = loop.label
        return info, loop.finaloplist(jitcell_token=jitcell_token, reset_label_token=False)
    except NotAVectorizeableLoop:
        debug_stop("vec-opt-loop")
        # vectorization is not possible
        return loop_info, version.loop.finaloplist()
    except NotAProfitableLoop:
        debug_stop("vec-opt-loop")
        # cost model says to skip this loop
        return loop_info, version.loop.finaloplist()
    except Exception as e:
        debug_stop("vec-opt-loop")
        debug_print("failed to vectorize loop. THIS IS A FATAL ERROR!")
        if we_are_translated():
            from rpython.rtyper.lltypesystem import lltype
            from rpython.rtyper.lltypesystem.lloperation import llop
            llop.debug_print_traceback(lltype.Void)
        else:
            raise
    finally:
        loop.teardown_vectorization()
    return loop_info, loop_ops

def user_loop_bail_fast_path(loop, warmstate):
    """ In a fast path over the trace loop: try to prevent vecopt
        from spending time on a loop that will most probably fail.
    """
    resop_count = 0 # the count of operations minus debug_merge_points
    vector_instr = 0
    guard_count = 0
    at_least_one_array_access = True
    for i,op in enumerate(loop.operations):
        if rop.is_jit_debug(op.opnum):
            continue
        if op.vector >= 0 and not rop.is_guard(op.opnum):
            vector_instr += 1
        resop_count += 1
        if op.is_primitive_array_access():
            at_least_one_array_access = True
        if warmstate.vec_ratio > 0.0:
            # blacklist
            if rop.is_call(op.opnum) or rop.is_call_assembler(op.opnum):
                return True
        if rop.is_guard(op.opnum):
            guard_count += 1
    if not at_least_one_array_access:
        return True
    if resop_count > warmstate.vec_length:
        return True
    if (float(vector_instr)/float(resop_count)) < warmstate.vec_ratio:
        return True
    if float(guard_count)/float(resop_count) > warmstate.vec_guard_ratio:
        return True
    return False
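
# Illustrative sketch (not part of the original source), using hypothetical
# threshold values: for a user-code trace with 40 non-debug resops and, say,
# vec_ratio = 0.1 and vec_guard_ratio = 0.5, the fast path only lets the loop
# through if at least 4 resops already carry a vector opcode candidate and at
# most 20 of them are guards; any call or call_assembler bails out immediately
# whenever vec_ratio > 0.0.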

class VectorizingOptimizer(Optimizer):
    """ Try to unroll the loop and find instructions to group """

    def __init__(self, metainterp_sd, jitdriver_sd, cost_threshold):
        Optimizer.__init__(self, metainterp_sd, jitdriver_sd)
        self.cpu = metainterp_sd.cpu
        self.cost_threshold = cost_threshold
        self.packset = None
        self.unroll_count = 0
        self.smallest_type_bytes = 0
        self.orig_label_args = None

    def run_optimization(self, info, loop):
        self.orig_label_args = loop.label.getarglist_copy()
        self.linear_find_smallest_type(loop)
        byte_count = self.smallest_type_bytes
        vsize = self.cpu.vector_register_size
        if vsize == 0 or byte_count == 0 or loop.label.getopnum() != rop.LABEL:
            # stop, there is no chance to vectorize this trace
            # we cannot optimize normal traces (if there is no label)
            raise NotAVectorizeableLoop()
        # find index guards and move to the earliest position
        graph = self.analyse_index_calculations(loop)
        if graph is not None:
            state = SchedulerState(graph)
            self.schedule(state) # reorder the trace
        # unroll
        self.unroll_count = self.get_unroll_count(vsize)
        self.unroll_loop_iterations(loop, self.unroll_count)
        # vectorize
        graph = DependencyGraph(loop)
        self.find_adjacent_memory_refs(graph)
        self.extend_packset()
        self.combine_packset()
        # TODO move cost model to CPU
        costmodel = X86_CostModel(self.cpu, self.cost_threshold)
        state = VecScheduleState(graph, self.packset, self.cpu, costmodel)
        self.schedule(state)
        if not state.profitable():
            raise NotAProfitableLoop()
        return graph.index_vars

    def unroll_loop_iterations(self, loop, unroll_count):
        """ Unroll the loop X times. unroll_count + 1 = unroll_factor """
        numops = len(loop.operations)
        renamer = Renamer()
        operations = loop.operations
        unrolled = []
        prohibit_opnums = (rop.GUARD_FUTURE_CONDITION,
                           rop.GUARD_NOT_INVALIDATED)
        orig_jump_args = loop.jump.getarglist()[:]
        # it is assumed that #label_args == #jump_args
        label_arg_count = len(orig_jump_args)
        for u in range(unroll_count):
            # fill the map with the renaming boxes. keys are boxes from the label
            for i in range(label_arg_count):
                la = loop.label.getarg(i)
                ja = loop.jump.getarg(i)
                ja = renamer.rename_box(ja)
                if la != ja:
                    renamer.start_renaming(la, ja)
            #
            for i, op in enumerate(operations):
                if op.getopnum() in prohibit_opnums:
                    continue # do not unroll this operation twice
                copied_op = copy_resop(op)
                if not copied_op.returns_void():
                    # every result assigns a new box, thus creates an entry
                    # to the rename map.
                    renamer.start_renaming(op, copied_op)
                #
                args = copied_op.getarglist()
                for a, arg in enumerate(args):
                    value = renamer.rename_box(arg)
                    copied_op.setarg(a, value)
                # not only the arguments, but also the fail args need
                # to be adjusted. rd_snapshot stores the live variables
                # that are needed to resume.
                if copied_op.is_guard():
                    self.copy_guard_descr(renamer, copied_op)
                #
                unrolled.append(copied_op)
        # the jump arguments have been changed:
        # if label(i(X)) ... jump(i(X+1)) is unrolled, the next unrolled iteration
        # must look like this: label(i(X+1)) ... jump(i(X+2))
        args = loop.jump.getarglist()
        for i, arg in enumerate(args):
            value = renamer.rename_box(arg)
            loop.jump.setarg(i, value)
        #
        loop.operations = operations + unrolled
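
    # Illustrative sketch (not part of the original source), assuming a trace of
    # the shape  label(i0) ; i1 = int_add(i0, 8) ; f0 = raw_load(p0, i0) ; jump(i1):
    # one unroll step appends the body with renamed boxes, giving roughly
    #   label(i0) ; i1 = int_add(i0, 8) ; f0 = raw_load(p0, i0)
    #             ; i2 = int_add(i1, 8) ; f1 = raw_load(p0, i1) ; jump(i2)
    # i.e. the label keeps its arguments while the jump now carries the boxes
    # produced by the last copy, which is what the renamer bookkeeping above ensures.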

    def copy_guard_descr(self, renamer, copied_op):
        descr = copied_op.getdescr()
        if descr:
            assert isinstance(descr, ResumeDescr)
            copied_op.setdescr(descr.clone())
            failargs = renamer.rename_failargs(copied_op, clone=True)
            copied_op.setfailargs(failargs)

    def linear_find_smallest_type(self, loop):
        # O(#operations)
        for i,op in enumerate(loop.operations):
            if op.is_primitive_array_access():
                descr = op.getdescr()
                byte_count = descr.get_item_size_in_bytes()
                if self.smallest_type_bytes == 0 \
                   or byte_count < self.smallest_type_bytes:
                    self.smallest_type_bytes = byte_count

    def get_unroll_count(self, simd_vec_reg_bytes):
        """ This is an estimated number of further unrolls """
        # this optimization is not opaque, and needs info about the CPU
        byte_count = self.smallest_type_bytes
        if byte_count == 0:
            return 0
        unroll_count = simd_vec_reg_bytes // byte_count
        return unroll_count-1 # it is already unrolled once
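
    # Worked example (illustrative, assuming 16-byte SSE vector registers): if
    # the smallest primitive type accessed in the loop is a 4-byte int, then
    # unroll_count = 16 // 4 - 1 = 3, i.e. the loop body ends up present four
    # times so that a single vector register can be filled per packed operation.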

    def find_adjacent_memory_refs(self, graph):
        """ The pre pass already builds a hash of memory references and the
        operations. Since it is in SSA form there are no array indices.
        If there are two array accesses in the unrolled loop
        i0,i1 and i1 = int_add(i0,c), then i0 = i0 + 0, i1 = i0 + 1.
        They are represented as a linear combination: i*c/d + e, where i is a
        variable and all others are integers that are calculated in reverse direction.
        """
        loop = graph.loop
        operations = loop.operations
        self.packset = PackSet(self.cpu.vector_register_size)
        memory_refs = graph.memory_refs.items()
        # initialize the pack set
        for node_a,memref_a in memory_refs:
            for node_b,memref_b in memory_refs:
                if memref_a is memref_b:
                    continue
                # instead of comparing every possible combination and
                # excluding a_opidx == b_opidx, only consider the ones
                # that point forward:
                if memref_a.is_adjacent_after(memref_b):
                    pair = self.packset.can_be_packed(node_a, node_b, None, False)
                    if pair:
                        self.packset.add_pack(pair)
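
    # Illustrative sketch (not part of the original source): after unrolling,
    # two loads such as  f0 = raw_load(p0, i0)  and  f1 = raw_load(p0, i1)  with
    # i1 = int_add(i0, 8) both normalize to the linear form i*c/d + e; their
    # constant parts differ by exactly one element size, so is_adjacent_after()
    # holds and the two nodes seed an initial Pair in the pack set.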

    def extend_packset(self):
        """ Follow dependency chains to find more candidates to put into
            pairs.
        """
        pack_count = self.packset.pack_count()
        while True:
            i = 0
            packs = self.packset.packs
            while i < len(packs):
                pack = packs[i]
                self.follow_def_uses(pack)
                i += 1
            if pack_count == self.packset.pack_count():
                pack_count = self.packset.pack_count()
                i = 0
                while i < len(packs):
                    pack = packs[i]
                    self.follow_use_defs(pack)
                    i += 1
                if pack_count == self.packset.pack_count():
                    break
            pack_count = self.packset.pack_count()
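
    # Illustrative sketch (not part of the original source): if (raw_load, raw_load)
    # is already a pair and both results feed isomorphic float_add operations,
    # follow_def_uses() proposes the two float_adds as a new pair; conversely,
    # follow_use_defs() walks from a paired operation back to the operations that
    # produced its arguments. The fixpoint loop repeats until no new pair is added.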

    def follow_use_defs(self, pack):
        assert pack.numops() == 2
        for ldep in pack.leftmost(True).depends():
            for rdep in pack.rightmost(True).depends():
                lnode = ldep.to
                rnode = rdep.to
                # only valid if left is in args of pack left
                left = lnode.getoperation()
                args = pack.leftmost().getarglist()
                if left is None or left not in args:
                    continue
                isomorph = isomorphic(lnode.getoperation(), rnode.getoperation())
                if isomorph and lnode.is_before(rnode):
                    pair = self.packset.can_be_packed(lnode, rnode, pack, False)
                    if pair:
                        self.packset.add_pack(pair)

    def follow_def_uses(self, pack):
        assert pack.numops() == 2
        for ldep in pack.leftmost(node=True).provides():
            for rdep in pack.rightmost(node=True).provides():
                lnode = ldep.to
                rnode = rdep.to
                left = pack.leftmost()
                args = lnode.getoperation().getarglist()
                if left is None or left not in args:
                    continue
                isomorph = isomorphic(lnode.getoperation(), rnode.getoperation())
                if isomorph and lnode.is_before(rnode):
                    pair = self.packset.can_be_packed(lnode, rnode, pack, True)
                    if pair:
                        self.packset.add_pack(pair)

    def combine_packset(self):
        """ Combination is done by iterating the packs, ordered by the op index
            of the first operation (= left). If a pack is marked as 'full', the
            next pack encountered whose left operation equals the full pack's
            right operation is removed, because the packs have intersecting edges.
        """
        if len(self.packset.packs) == 0:
            raise NotAVectorizeableLoop()
        i = 0
        j = 0
        end_ij = len(self.packset.packs)
        while True:
            len_before = len(self.packset.packs)
            i = 0
            while i < end_ij:
                while j < end_ij and i < end_ij:
                    if i == j:
                        # do not pack with itself! won't work...
                        j += 1
                        continue
                    pack1 = self.packset.packs[i]
                    pack2 = self.packset.packs[j]
                    if pack1.rightmost_match_leftmost(pack2):
                        end_ij = self.packset.combine(i,j)
                    else:
                        # do not inc in rightmost_match_leftmost
                        # this could miss some pack
                        j += 1
                i += 1
                j = 0
            if len_before == len(self.packset.packs):
                break
        self.packset.split_overloaded_packs()
        if not we_are_translated():
            # some test cases check the accumulation variables
            self.packset.accum_vars = {}
            print "packs:"
            check = {}
            fail = False
            for pack in self.packset.packs:
                left = pack.operations[0]
                right = pack.operations[-1]
                if left in check or right in check:
                    fail = True
                check[left] = None
                check[right] = None
                print " ", pack
            if fail:
                assert False
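
    # Illustrative sketch (not part of the original source): pairs that share an
    # operation are merged into larger packs, e.g. (o1, o2), (o2, o3), (o3, o4)
    # collapse into the single pack [o1, o2, o3, o4]; split_overloaded_packs()
    # afterwards cuts down packs that would not fit into one vector register.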

    def schedule(self, state):
        state.prepare()
        scheduler = Scheduler()
        scheduler.walk_and_emit(state)
        if not state.profitable():
            return
        state.post_schedule()

    def analyse_index_calculations(self, loop):
        """ Tries to move guarding instructions and all the instructions that
        need to be computed for the guard to the loop header. This ensures
        that guards fail 'early' and relax dependencies. Without this
        step vectorization would not be possible!
        """
        graph = DependencyGraph(loop)
        zero_deps = {}
        for node in graph.nodes:
            if node.depends_count() == 0:
                zero_deps[node] = 0
        earlyexit = graph.imaginary_node("early exit")
        guards = graph.guards
        one_valid = False
        valid_guards = []
        for guard_node in guards:
            modify_later = []
            last_prev_node = None
            valid = True
            if guard_node in zero_deps:
                del zero_deps[guard_node]
            for prev_dep in guard_node.depends():
                prev_node = prev_dep.to
                if prev_dep.is_failarg():
                    # remove this edge later.
                    # 1) only because of failing, this dependency exists
                    # 2) non pure operation points to this guard.
                    #    but if this guard only depends on pure operations, it can be checked
                    #    at an earlier position, the non pure op can execute later!
                    modify_later.append(prev_node)
                else:
                    for path in prev_node.iterate_paths(None, backwards=True, blacklist=True):
                        if not path.is_always_pure():
                            valid = False
                        else:
                            if path.last() in zero_deps:
                                del zero_deps[path.last()]
                    if not valid:
                        break
            if valid:
                # transformation is valid, modify the graph and execute
                # this guard earlier
                one_valid = True
                for node in modify_later:
                    node.remove_edge_to(guard_node)
                # for every edge that starts in the guard, the early exit
                # inherits the edge and the guard then provides to the early exit
                for dep in guard_node.provides()[:]:
                    assert not dep.target_node().is_imaginary()
                    earlyexit.edge_to(dep.target_node(), failarg=True)
                    guard_node.remove_edge_to(dep.target_node())
                valid_guards.append(guard_node)
                guard_node.edge_to(earlyexit)
                self.mark_guard(guard_node, loop)
        for node in zero_deps.keys():
            assert not node.is_imaginary()
            earlyexit.edge_to(node)
        if one_valid:
            return graph
        return None
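
    # Illustrative sketch (not part of the original source): an index guard such
    # as  i2 = int_lt(i1, i0) ; guard_true(i2)  depends only on pure integer
    # arithmetic, so its fail-arg edges are cut and the guard is re-attached to
    # the imaginary "early exit" node; the scheduler can then emit it at the loop
    # header, ahead of the loads and stores it used to be interleaved with.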

    def mark_guard(self, node, loop):
        """ Marks this guard as an early exit! """
        op = node.getoperation()
        assert isinstance(op, GuardResOp)
        if op.getopnum() in (rop.GUARD_TRUE, rop.GUARD_FALSE):
            descr = CompileLoopVersionDescr()
            if op.getdescr():
                descr.copy_all_attributes_from(op.getdescr())
            op.setdescr(descr)
        op.setfailargs(loop.label.getarglist_copy())

class CostModel(object):
    """ Utility to estimate the savings for the new trace loop.
        The main reason to have this is the frequent unpack instructions,
        and the missing ability (by design) to detect non-vectorizable loops.
    """
    def __init__(self, cpu, threshold):
        self.threshold = threshold
        self.vec_reg_size = cpu.vector_register_size
        self.savings = 0

    def reset_savings(self):
        self.savings = 0

    def record_cast_int(self, op):
        raise NotImplementedError

    def record_pack_savings(self, pack, times):
        raise NotImplementedError

    def record_vector_pack(self, box, index, count):
        raise NotImplementedError

    def record_vector_unpack(self, box, index, count):
        raise NotImplementedError

    def unpack_cost(self, op, index, count):
        raise NotImplementedError

    def savings_for_pack(self, pack, times):
        raise NotImplementedError

    def profitable(self):
        return self.savings >= 0

class X86_CostModel(CostModel):
    def record_pack_savings(self, pack, times):
        cost, benefit_factor = (1,1)
        node = pack.operations[0]
        op = node.getoperation()
        if op.getopnum() == rop.INT_SIGNEXT:
            cost, benefit_factor = self.cb_signext(pack)
        #
        self.savings += benefit_factor * times - cost

    def cb_signext(self, pack):
        left = pack.leftmost()
        if left.cast_to_bytesize() == left.cast_from_bytesize():
            return 0, 0
        # no benefit for this operation! needs many x86 instrs
        return 1, 0

    def record_cast_int(self, fromsize, tosize, count):
        # for each move there is 1 instruction
        if fromsize == 8 and tosize == 4 and count == 2:
            self.savings -= 1
        else:
            self.savings += -count

    def record_vector_pack(self, src, index, count):
        vecinfo = forwarded_vecinfo(src)
        if vecinfo.datatype == FLOAT:
            if index == 1 and count == 1:
                self.savings -= 2
                return
        self.savings -= count

    def record_vector_unpack(self, src, index, count):
        self.record_vector_pack(src, index, count)

def isomorphic(l_op, r_op):
    """ Subject to definition; here it means an equal operation.
        See limitations (vectorization.rst).
    """
    if l_op.getopnum() == r_op.getopnum():
        l_vecinfo = forwarded_vecinfo(l_op)
        r_vecinfo = forwarded_vecinfo(r_op)
        return l_vecinfo.bytesize == r_vecinfo.bytesize
    return False
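
# Illustrative sketch (not part of the original source): two float_add operations
# whose forwarded VectorizationInfo reports the same bytesize (e.g. 8) are
# isomorphic and may be packed, whereas a float_add and an int_add, or two adds
# operating on different element sizes, are not.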

class PackSet(object):
    _attrs_ = ('packs', 'vec_reg_size')

    def __init__(self, vec_reg_size):
        self.packs = []
        self.vec_reg_size = vec_reg_size

    def pack_count(self):
        return len(self.packs)

    def add_pack(self, pack):
        self.packs.append(pack)

    def can_be_packed(self, lnode, rnode, origin_pack, forward):
        """ Check to ensure that two nodes might be packed into a Pair.
        """
        if isomorphic(lnode.getoperation(), rnode.getoperation()):
            # even if a guard depends on the previous it is able to
            lop = lnode.getoperation()
            independent = lnode.independent(rnode)
            if independent:
                if forward and origin_pack.is_accumulating():
                    # in this case the split accumulator must
                    # be combined. This case is not supported
                    raise NotAVectorizeableLoop()
                #
                if self.contains_pair(lnode, rnode):
                    return None
                #
                if origin_pack is None:
                    op = lnode.getoperation()
                    if rop.is_primitive_load(op.opnum):
                        return Pair(lnode, rnode)
                    else:
                        return Pair(lnode, rnode)
                if self.profitable_pack(lnode, rnode, origin_pack, forward):
                    return Pair(lnode, rnode)
            else:
                if self.contains_pair(lnode, rnode):
                    return None
                if origin_pack is not None:
                    return self.accumulates_pair(lnode, rnode, origin_pack)
        return None

    def contains_pair(self, lnode, rnode):
        for pack in self.packs:
            if pack.leftmost(node=True) is lnode or \
               pack.rightmost(node=True) is rnode:
                return True
        return False

    def profitable_pack(self, lnode, rnode, origin_pack, forward):
        if self.prohibit_packing(origin_pack, origin_pack.leftmost(),
                                 lnode.getoperation(), forward):
            return False
        if self.prohibit_packing(origin_pack, origin_pack.rightmost(),
                                 rnode.getoperation(), forward):
            return False
        return True

    def prohibit_packing(self, pack, packed, inquestion, forward):
        """ Blocks the packing of some operations """
        if inquestion.vector == -1:
            return True
        if packed.is_primitive_array_access():
            if packed.getarg(1) is inquestion:
                return True
        if not forward and inquestion.getopnum() == rop.INT_SIGNEXT:
            # prohibit the packing of signext in backwards direction
            # the type cannot be determined!
            return True
        return False

    def combine(self, i, j):
        """ Combine two packs. It is assumed that the attribute self.packs
        is not iterated when calling this method.
        """
        pkg_a = self.packs[i]
        pkg_b = self.packs[j]
        operations = pkg_a.operations
        for op in pkg_b.operations[1:]:
            operations.append(op)
        self.packs[i] = pkg_a.clone(operations)
        del self.packs[j]
        return len(self.packs)

    def accumulates_pair(self, lnode, rnode, origin_pack):
        # lnode and rnode are isomorphic and dependent
        assert isinstance(origin_pack, Pair)
        left = lnode.getoperation()
        opnum = left.getopnum()
        if opnum in AccumPack.SUPPORTED:
            right = rnode.getoperation()
            assert left.numargs() == 2 and not left.returns_void()
            scalar, index = self.getaccumulator_variable(left, right, origin_pack)
            if not scalar:
                return None
            # does the dependency exist only because of the left?
            for dep in lnode.provides():
                if dep.to is rnode:
                    if not dep.because_of(scalar):
                        # not quite ... this cannot be handled
                        return None
            # get the original variable
            scalar = left.getarg(index)
            # in either of the two cases the arguments are mixed,
            # which is not handled currently
            other_index = (index + 1) % 2
            if left.getarg(other_index) is not origin_pack.leftmost():
                return None
            if right.getarg(other_index) is not origin_pack.rightmost():
                return None
            # this can be handled by accumulation
            size = INT_WORD
            if left.type == 'f':
                size = FLOAT_WORD
            l_vecinfo = forwarded_vecinfo(left)
            r_vecinfo = forwarded_vecinfo(right)
            if not (l_vecinfo.bytesize == r_vecinfo.bytesize and l_vecinfo.bytesize == size):
                # do not support it if the type size is smaller
                # than the cpu word size.
                # WHY?
                # to ensure accum is done on the right size, the dependencies
                # of leading/preceding signext/floatcast instructions need to be
                # considered. => tree pattern matching problem.
                return None
            operator = AccumPack.SUPPORTED[opnum]
            return AccumPack([lnode, rnode], operator, index)
        is_guard = left.is_guard() and left.getopnum() in (rop.GUARD_TRUE, rop.GUARD_FALSE)
        if is_guard:
            return AccumPack([lnode, rnode], 'g', 0)
        return None

    def getaccumulator_variable(self, left, right, origin_pack):
        for i, arg in enumerate(right.getarglist()):
            if arg is left:
                return arg, i
        return None, -1
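
    # Illustrative sketch (not part of the original source): for a reduction such
    # as  acc1 = float_add(acc0, f0) ; acc2 = float_add(acc1, f1)  the accumulator
    # argument of the right operation is the left operation itself (acc1, index 0);
    # accumulates_pair() then builds an AccumPack carrying the '+' operator and
    # that index, provided the other arguments come from the originating pair and
    # the element size matches the CPU word size.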

    def accumulate_prepare(self, state):
        vec_reg_size = state.vec_reg_size
        for pack in self.packs:
            if not pack.is_accumulating():
                continue
            if pack.leftmost().is_guard():
                # guard breaks dependencies, thus it is an accumulation pack
                continue
            for i,node in enumerate(pack.operations):
                op = node.getoperation()
                state.accumulation[op] = pack
            assert isinstance(pack, AccumPack)
            datatype = pack.getdatatype()
            bytesize = pack.getbytesize()
            count = vec_reg_size // bytesize
            signed = datatype == 'i'
            oplist = state.invariant_oplist
            # reset the box to zeros or ones
            if pack.reduce_init() == 0:
                vecop = OpHelpers.create_vec(datatype, bytesize, signed, count)
                oplist.append(vecop)
                vecop = VecOperation(rop.VEC_INT_XOR, [vecop, vecop],
                                     vecop, count)
                oplist.append(vecop)
            elif pack.reduce_init() == 1:
                # multiply is only supported by floats
                vecop = OpHelpers.create_vec_expand(ConstFloat(1.0), bytesize,
                                                    signed, count)
                oplist.append(vecop)
            else:
                raise NotImplementedError("cannot handle %s" % pack.operator)
            # pack the scalar value
            args = [vecop, pack.getleftmostseed(), ConstInt(0), ConstInt(1)]
            vecop = OpHelpers.create_vec_pack(datatype, args, bytesize,
                                              signed, count)
            oplist.append(vecop)
            seed = pack.getleftmostseed()
            state.accumulation[seed] = pack
            # rename the variable with the box
            state.setvector_of_box(seed, 0, vecop) # prevent it from expansion
            state.renamer.start_renaming(seed, vecop)

    def split_overloaded_packs(self):
        newpacks = []
        for i,pack in enumerate(self.packs):
            load = pack.pack_load(self.vec_reg_size)
            if load > Pack.FULL:
                pack.split(newpacks, self.vec_reg_size)
                continue
            if load < Pack.FULL:
                for op in pack.operations:
                    op.priority = -100
                pack.clear()
                self.packs[i] = None
                continue
        self.packs = [pack for pack in self.packs + newpacks if pack]