
/compiler/cmm/CmmSink.hs

http://github.com/ghc/ghc
{-# LANGUAGE GADTs #-}
module CmmSink (
     cmmSink
  ) where

import Cmm
import CmmOpt
import BlockId
import CmmLive
import CmmUtils
import Hoopl
import CodeGen.Platform
import Platform (isARM, platformArch)

import DynFlags
import UniqFM
import PprCmm ()

import Data.List (partition)
import qualified Data.Set as Set
import Data.Maybe

-- -----------------------------------------------------------------------------
-- Sinking and inlining

-- This is an optimisation pass that
--  (a) moves assignments closer to their uses, to reduce register pressure
--  (b) pushes assignments into a single branch of a conditional if possible
--  (c) inlines assignments to registers that are mentioned only once
--  (d) discards dead assignments
--
-- This tightens up lots of register-heavy code.  It is particularly
-- helpful in the Cmm generated by the Stg->Cmm code generator, in
-- which every function starts with a copyIn sequence like:
--
--    x1 = R1
--    x2 = Sp[8]
--    x3 = Sp[16]
--    if (Sp - 32 < SpLim) then L1 else L2
--
-- we really want to push the x1..x3 assignments into the L2 branch.
--
-- Algorithm:
--
--  * Start by doing liveness analysis.
--
--  * Keep a list of assignments A; earlier ones may refer to later ones.
--    Currently we only sink assignments to local registers, because we
--    don't have liveness information about global registers.
--
--  * Walk forwards through the graph, look at each node N:
--
--    * If it is a dead assignment, i.e. an assignment to a register that
--      is not used after N, discard it.
--
--    * Try to inline based on the current list of assignments:
--      * If any assignments in A (1) occur only once in N, and (2) are
--        not live after N, inline the assignment and remove it
--        from A.
--
--      * If an assignment in A is cheap (RHS is a local register), then
--        inline the assignment and keep it in A in case it is used afterwards.
--
--      * Otherwise don't inline.
--
--    * If N is an assignment to a local register, pick up the assignment
--      and add it to A.
--
--    * If N is not an assignment to a local register:
--      * remove any assignments from A that conflict with N, and
--        place them before N in the current block.  We call this
--        "dropping" the assignments.
--
--      * An assignment conflicts with N if it:
--        - assigns to a register mentioned in N
--        - mentions a register assigned by N
--        - reads from memory written by N
--      * do this recursively, dropping dependent assignments
--
--    * At an exit node:
--      * drop any assignments that are live on more than one successor
--        and are not trivial
--      * if any successor has more than one predecessor (a join point),
--        drop everything live in that successor.  Since we only propagate
--        assignments that are not dead at the successor, we will therefore
--        eliminate all assignments dead at this point.  Thus analysis of a
--        join point will always begin with an empty list of assignments.
--
--
-- As a result of the above algorithm, sinking deletes some dead assignments
-- (transitively, even).  This isn't as good as removeDeadAssignments,
-- but it's much cheaper.
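--
-- As an illustration of the steps above, here is a small worked example
-- (a sketch, not output from this pass).  Given the block
--
--    x = Sp[8]
--    y = x + 1
--    P64[Sp] = y
--
-- we pick up x = Sp[8]; at y = x + 1, x occurs once in the node and is
-- not live afterwards, so it is inlined and removed from A, which
-- becomes [ y = Sp[8] + 1 ].  At the store, y likewise occurs once and
-- dies, so the block collapses to the single node P64[Sp] = Sp[8] + 1.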
-- -----------------------------------------------------------------------------
-- things that we aren't optimising very well yet.
--
-- -----------
-- (1) From GHC's FastString.hashStr:
--
--  s2ay:
--      if ((_s2an::I64 == _s2ao::I64) >= 1) goto c2gn; else goto c2gp;
--  c2gn:
--      R1 = _s2au::I64;
--      call (I64[Sp])(R1) args: 8, res: 0, upd: 8;
--  c2gp:
--      _s2cO::I64 = %MO_S_Rem_W64(%MO_UU_Conv_W8_W64(I8[_s2aq::I64 + (_s2an::I64 << 0)]) + _s2au::I64 * 128,
--                                 4091);
--      _s2an::I64 = _s2an::I64 + 1;
--      _s2au::I64 = _s2cO::I64;
--      goto s2ay;
--
-- a nice loop, but we didn't eliminate the silly assignment at the end.
-- See Note [dependent assignments], which would probably fix this.
-- This is #8336 on Trac.
--
-- -----------
-- (2) From stg_atomically_frame in PrimOps.cmm
--
-- We have a diamond control flow:
--
--     x = ...
--       |
--      / \
--     A   B
--      \ /
--       |
--    use of x
--
-- Now x won't be sunk down to its use, because we won't push it into
-- both branches of the conditional.  We certainly do have to check
-- that we can sink it past all the code in both A and B, but having
-- discovered that, we could sink it to its use.
--
-- -----------------------------------------------------------------------------

type Assignment = (LocalReg, CmmExpr, AbsMem)
  -- Assignment caches AbsMem, an abstraction of the memory read by
  -- the RHS of the assignment.

type Assignments = [Assignment]
  -- A sequence of assignments; kept in *reverse* order:
  -- So the list [ x=e1, y=e2 ] means the sequence of assignments
  --     y = e2
  --     x = e1

cmmSink :: DynFlags -> CmmGraph -> CmmGraph
cmmSink dflags graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks
  where
  liveness = cmmLocalLiveness dflags graph
  getLive l = mapFindWithDefault Set.empty l liveness

  blocks = postorderDfs graph

  join_pts = findJoinPoints blocks

  sink :: BlockEnv Assignments -> [CmmBlock] -> [CmmBlock]
  sink _ [] = []
  sink sunk (b:bs) =
    -- pprTrace "sink" (ppr lbl) $
    blockJoin first final_middle final_last : sink sunk' bs
    where
      lbl = entryLabel b
      (first, middle, last) = blockSplit b

      succs = successors last

      -- Annotate the middle nodes with the registers live *after*
      -- the node.  This will help us decide whether we can inline
      -- an assignment in the current node or not.
      live = Set.unions (map getLive succs)
      live_middle = gen_kill dflags last live
      ann_middles = annotate dflags live_middle (blockToList middle)

      -- Now sink and inline in this block
      (middle', assigs) = walk dflags ann_middles (mapFindWithDefault [] lbl sunk)
      fold_last = constantFoldNode dflags last
      (final_last, assigs') = tryToInline dflags live fold_last assigs

      -- We cannot sink into join points (successors with more than
      -- one predecessor), so identify the join points and the set
      -- of registers live in them.
      (joins, nonjoins) = partition (`mapMember` join_pts) succs
      live_in_joins = Set.unions (map getLive joins)

      -- We do not want to sink an assignment into multiple branches,
      -- so identify the set of registers live in multiple successors.
      -- This is made more complicated because when we sink an assignment
      -- into one branch, this might change the set of registers that are
      -- now live in multiple branches.
      init_live_sets = map getLive nonjoins
      live_in_multi live_sets r =
         case filter (Set.member r) live_sets of
           (_one:_two:_) -> True
           _ -> False

      -- Now, drop any assignments that we will not sink any further.
      (dropped_last, assigs'') = dropAssignments dflags drop_if init_live_sets assigs'

      drop_if a@(r,rhs,_) live_sets = (should_drop, live_sets')
          where
            should_drop =  conflicts dflags a final_last
                        || not (isTrivial dflags rhs) && live_in_multi live_sets r
                        || r `Set.member` live_in_joins

            live_sets' | should_drop = live_sets
                       | otherwise   = map upd live_sets

            upd set | r `Set.member` set = set `Set.union` live_rhs
                    | otherwise          = set

            live_rhs = foldRegsUsed dflags extendRegSet emptyRegSet rhs

      final_middle = foldl blockSnoc middle' dropped_last

      sunk' = mapUnion sunk $
                 mapFromList [ (l, filterAssignments dflags (getLive l) assigs'')
                             | l <- succs ]

{- TODO: enable this later, when we have some good tests in place to
   measure the effect and tune it.

-- small: an expression we don't mind duplicating
isSmall :: CmmExpr -> Bool
isSmall (CmmReg (CmmLocal _)) = True --
isSmall (CmmLit _) = True
isSmall (CmmMachOp (MO_Add _) [x,y]) = isTrivial x && isTrivial y
isSmall (CmmRegOff (CmmLocal _) _) = True
isSmall _ = False
-}

--
-- We allow duplication of trivial expressions: registers (both local and
-- global) and literals.
--
isTrivial :: DynFlags -> CmmExpr -> Bool
isTrivial _ (CmmReg (CmmLocal _)) = True
isTrivial dflags (CmmReg (CmmGlobal r)) = -- see Note [Inline GlobalRegs?]
  if isARM (platformArch (targetPlatform dflags))
  then True -- CodeGen.Platform.ARM does not have globalRegMaybe
  else isJust (globalRegMaybe (targetPlatform dflags) r)
  -- GlobalRegs that are loads from BaseReg are not trivial
isTrivial _ (CmmLit _) = True
isTrivial _ _ = False

--
-- annotate each node with the set of registers live *after* the node
--
annotate :: DynFlags -> LocalRegSet -> [CmmNode O O] -> [(LocalRegSet, CmmNode O O)]
annotate dflags live nodes = snd $ foldr ann (live,[]) nodes
  where ann n (live,nodes) = (gen_kill dflags n live, (live,n) : nodes)
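
-- For example (a sketch): annotating the two-node sequence
--     x = y ; z = x
-- with live-after set {z} yields [ ({x}, x = y), ({z}, z = x) ],
-- since gen_kill applied to z = x kills z and generates x.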

--
-- Find the blocks that have more than one predecessor (join points)
--
findJoinPoints :: [CmmBlock] -> BlockEnv Int
findJoinPoints blocks = mapFilter (>1) succ_counts
  where
    all_succs = concatMap successors blocks

    succ_counts :: BlockEnv Int
    succ_counts = foldr (\l -> mapInsertWith (+) l 1) mapEmpty all_succs
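
-- For example (a sketch): in a graph with edges A->B, A->C and B->C,
-- the label C appears as a successor twice, so findJoinPoints yields a
-- map containing C -> 2; B has a single predecessor and is filtered out.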

--
-- filter the list of assignments to remove any assignments that
-- are not live in a continuation.
--
filterAssignments :: DynFlags -> LocalRegSet -> Assignments -> Assignments
filterAssignments dflags live assigs = reverse (go assigs [])
  where go []             kept = kept
        go (a@(r,_,_):as) kept | needed    = go as (a:kept)
                               | otherwise = go as kept
           where
              needed = r `Set.member` live
                       || any (conflicts dflags a) (map toNode kept)
                       -- Note that we must keep assignments that are
                       -- referred to by other assignments we have
                       -- already kept.

-- -----------------------------------------------------------------------------
-- Walk through the nodes of a block, sinking and inlining assignments
-- as we go.
--
-- On input we pass in a:
--    * list of nodes in the block
--    * a list of assignments that appeared *before* this block and
--      that are being sunk.
--
-- On output we get:
--    * a new block
--    * a list of assignments that will be placed *after* that block.
--

walk :: DynFlags
     -> [(LocalRegSet, CmmNode O O)]   -- nodes of the block, annotated with
                                       -- the set of registers live *after*
                                       -- this node.

     -> Assignments                    -- The current list of
                                       -- assignments we are sinking.
                                       -- Earlier assignments may refer
                                       -- to later ones.

     -> ( Block CmmNode O O            -- The new block
        , Assignments                  -- Assignments to sink further
        )

walk dflags nodes assigs = go nodes emptyBlock assigs
 where
   go []               block as = (block, as)
   go ((live,node):ns) block as
    | shouldDiscard node live           = go ns block as
       -- discard dead assignment
    | Just a <- shouldSink dflags node2 = go ns block (a : as1)
    | otherwise                         = go ns block' as'
    where
      node1 = constantFoldNode dflags node

      (node2, as1) = tryToInline dflags live node1 as

      (dropped, as') = dropAssignmentsSimple dflags
                          (\a -> conflicts dflags a node2) as1

      block' = foldl blockSnoc block dropped `blockSnoc` node2

--
-- Heuristic to decide whether to pick up and sink an assignment.
-- Currently we pick up all assignments to local registers.  It might
-- be profitable to sink assignments to global regs too, but the
-- liveness analysis doesn't track those (yet) so we can't.
--
shouldSink :: DynFlags -> CmmNode e x -> Maybe Assignment
shouldSink dflags (CmmAssign (CmmLocal r) e) | no_local_regs = Just (r, e, exprMem dflags e)
  where no_local_regs = True -- foldRegsUsed (\_ _ -> False) True e
shouldSink _ _other = Nothing

--
-- discard dead assignments.  This doesn't do as good a job as
-- removeDeadAssignments, because it would need multiple passes
-- to get all the dead code, but it catches the common case of
-- superfluous reloads from the stack that the stack allocator
-- leaves behind.
--
-- Also we catch "r = r" here.  You might think it would fall
-- out of inlining, but the inliner will see that r is live
-- after the instruction and choose not to inline r in the rhs.
--
shouldDiscard :: CmmNode e x -> LocalRegSet -> Bool
shouldDiscard node live
   = case node of
       CmmAssign r (CmmReg r') | r == r' -> True
       CmmAssign (CmmLocal r) _ -> not (r `Set.member` live)
       _otherwise -> False

toNode :: Assignment -> CmmNode O O
toNode (r,rhs,_) = CmmAssign (CmmLocal r) rhs

dropAssignmentsSimple :: DynFlags -> (Assignment -> Bool) -> Assignments
                      -> ([CmmNode O O], Assignments)
dropAssignmentsSimple dflags f = dropAssignments dflags (\a _ -> (f a, ())) ()

dropAssignments :: DynFlags -> (Assignment -> s -> (Bool, s)) -> s -> Assignments
                -> ([CmmNode O O], Assignments)
dropAssignments dflags should_drop state assigs
 = (dropped, reverse kept)
 where
   (dropped,kept) = go state assigs [] []

   go _     []             dropped kept = (dropped, kept)
   go state (assig : rest) dropped kept
      | conflict  = go state' rest (toNode assig : dropped) kept
      | otherwise = go state' rest dropped (assig:kept)
      where
        (dropit, state') = should_drop assig state
        conflict = dropit || any (conflicts dflags assig) dropped
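
-- For example (a sketch): given A = [ x = y + 1, y = Sp[8] ] (reverse
-- order, so y is assigned first), a predicate that drops x also forces
-- y to be dropped, because the already-dropped node x = y + 1 mentions
-- y; both are then emitted in program order, y = Sp[8]; x = y + 1.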

-- -----------------------------------------------------------------------------
-- Try to inline assignments into a node.

tryToInline
   :: DynFlags
   -> LocalRegSet               -- set of registers live after this
                                -- node.  We cannot inline anything
                                -- that is live after the node, unless
                                -- it is small enough to duplicate.
   -> CmmNode O x               -- The node to inline into
   -> Assignments               -- Assignments to inline
   -> (
        CmmNode O x             -- New node
      , Assignments             -- Remaining assignments
      )

tryToInline dflags live node assigs = go usages node [] assigs
 where
  usages :: UniqFM Int -- Maps each LocalReg to a count of how often it is used
  usages = foldLocalRegsUsed dflags addUsage emptyUFM node

  go _usages node _skipped [] = (node, [])

  go usages node skipped (a@(l,rhs,_) : rest)
   | cannot_inline        = dont_inline
   | occurs_none          = discard  -- Note [discard during inlining]
   | occurs_once          = inline_and_discard
   | isTrivial dflags rhs = inline_and_keep
   | otherwise            = dont_inline
   where
        inline_and_discard = go usages' inl_node skipped rest
          where usages' = foldLocalRegsUsed dflags addUsage usages rhs

        discard = go usages node skipped rest

        dont_inline     = keep node     -- don't inline the assignment, keep it
        inline_and_keep = keep inl_node -- inline the assignment, keep it

        keep node' = (final_node, a : rest')
          where (final_node, rest') = go usages' node' (l:skipped) rest
                usages' = foldLocalRegsUsed dflags (\m r -> addToUFM m r 2)
                                            usages rhs
                -- we must not inline anything that is mentioned in the
                -- RHS of a binding that we have already skipped, so we
                -- set the usages of the regs on the RHS to 2.

        cannot_inline = skipped `regsUsedIn` rhs -- Note [dependent assignments]
                        || l `elem` skipped
                        || not (okToInline dflags rhs node)

        l_usages = lookupUFM usages l
        l_live   = l `elemRegSet` live

        occurs_once = not l_live && l_usages == Just 1
        occurs_none = not l_live && l_usages == Nothing

        inl_node = mapExpDeep inline node
                   -- mapExpDeep is where the inlining actually takes place!
           where inline (CmmReg    (CmmLocal l'))     | l == l' = rhs
                 inline (CmmRegOff (CmmLocal l') off) | l == l'
                    = cmmOffset dflags rhs off
                 -- re-constant fold after inlining
                 inline (CmmMachOp op args) = cmmMachOpFold dflags op args
                 inline other = other
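
-- For example (a sketch): inlining into the node R1 = x + y with
-- A = [ x = Sp[8], y = Sp[16] ], where neither register is live after
-- the node and each occurs exactly once in it, rewrites the node to
-- R1 = Sp[8] + Sp[16] and leaves the assignment list empty.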

-- Note [dependent assignments]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- If our assignment list looks like
--
--    [ y = e,  x = ... y ... ]
--
-- We cannot inline x.  Remember this list is really in reverse order,
-- so it means  x = ... y ...; y = e
--
-- Hence if we inline x, the outer assignment to y will capture the
-- reference in x's right hand side.
--
-- In this case we should rename the y in x's right-hand side,
-- i.e. change the list to [ y = e, x = ... y1 ..., y1 = y ]
-- Now we can go ahead and inline x.
--
-- For now we do nothing, because this would require putting
-- everything inside UniqSM.
--
-- One more variant of this (#7366):
--
--    [ y = e, y = z ]
--
-- If we don't want to inline y = e, because y is used many times, we
-- might still be tempted to inline y = z (because we always inline
-- trivial rhs's).  But of course we can't, because y is equal to e,
-- not z.

-- Note [discard during inlining]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-- Opportunities to discard assignments sometimes appear after we've
-- done some inlining.  Here's an example:
--
--      x = R1;
--      y = P64[x + 7];
--      z = P64[x + 15];
--      /* z is dead */
--      R1 = y & (-8);
--
-- The x assignment is trivial, so we inline it in the RHS of y, and
-- keep both x and y.  z gets dropped because it is dead, then we
-- inline y, and we have a dead assignment to x.  If we don't notice
-- that x is dead in tryToInline, we end up retaining it.

addUsage :: UniqFM Int -> LocalReg -> UniqFM Int
addUsage m r = addToUFM_C (+) m r 1
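
-- For example (a sketch): addUsage (addUsage emptyUFM r) r maps r to 2,
-- since addToUFM_C combines the existing count with the new 1 using (+).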

regsUsedIn :: [LocalReg] -> CmmExpr -> Bool
regsUsedIn [] _ = False
regsUsedIn ls e = wrapRecExpf f e False
  where f (CmmReg (CmmLocal l))      _ | l `elem` ls = True
        f (CmmRegOff (CmmLocal l) _) _ | l `elem` ls = True
        f _ z = z

-- we don't inline into CmmUnsafeForeignCall if the expression refers
-- to global registers.  This is a HACK to avoid global registers
-- clashing with C argument-passing registers, really the back-end
-- ought to be able to handle it properly, but currently neither PprC
-- nor the NCG can do it.  See Note [Register parameter passing]
-- See also StgCmmForeign:load_args_into_temps.
okToInline :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
okToInline dflags expr node@(CmmUnsafeForeignCall{}) =
    not (globalRegistersConflict dflags expr node)
okToInline _ _ _ = True

-- -----------------------------------------------------------------------------

-- | @conflicts (r,e) node@ is @False@ if and only if the assignment
-- @r = e@ can be safely commuted past statement @node@.
conflicts :: DynFlags -> Assignment -> CmmNode O x -> Bool
conflicts dflags (r, rhs, addr) node

  -- (1) node defines registers used by rhs of assignment. This catches
  -- assignments and all three kinds of calls. See Note [Sinking and calls]
  | globalRegistersConflict dflags rhs node                       = True
  | localRegistersConflict  dflags rhs node                       = True

  -- (2) node uses register defined by assignment
  | foldRegsUsed dflags (\b r' -> r == r' || b) False node        = True

  -- (3) a store to an address conflicts with a read of the same memory
  | CmmStore addr' e <- node
  , memConflicts addr (loadAddr dflags addr' (cmmExprWidth dflags e)) = True

  -- (4) an assignment to Hp/Sp conflicts with a heap/stack read respectively
  | HeapMem  <- addr, CmmAssign (CmmGlobal Hp) _ <- node          = True
  | StackMem <- addr, CmmAssign (CmmGlobal Sp) _ <- node          = True
  | SpMem{}  <- addr, CmmAssign (CmmGlobal Sp) _ <- node          = True

  -- (5) foreign calls clobber heap: see Note [Foreign calls clobber heap]
  | CmmUnsafeForeignCall{} <- node, memConflicts addr AnyMem      = True

  -- (6) native calls clobber any memory
  | CmmCall{} <- node, memConflicts addr AnyMem                   = True

  -- (7) otherwise, no conflict
  | otherwise = False
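
-- For example (a sketch of case (3)): the assignment x = I64[Sp + 8],
-- whose cached AbsMem is SpMem 8 8, conflicts with the store
-- P64[Sp + 8] = y, but commutes safely past P64[Sp + 16] = y, because
-- the two Sp slots do not overlap.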

-- Returns True if node defines any global registers that are used in the
-- Cmm expression
globalRegistersConflict :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
globalRegistersConflict dflags expr node =
    foldRegsDefd dflags (\b r -> b || regUsedIn dflags (CmmGlobal r) expr)
                 False node

-- Returns True if node defines any local registers that are used in the
-- Cmm expression
localRegistersConflict :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
localRegistersConflict dflags expr node =
    foldRegsDefd dflags (\b r -> b || regUsedIn dflags (CmmLocal r) expr)
                 False node

-- Note [Sinking and calls]
-- ~~~~~~~~~~~~~~~~~~~~~~~~
--
-- We have three kinds of calls: normal (CmmCall), safe foreign (CmmForeignCall)
-- and unsafe foreign (CmmUnsafeForeignCall). We run the sinking pass after
-- stack layout (see Note [Sinking after stack layout]) which leads to two
-- invariants related to calls:
--
--   a) during the stack layout phase all safe foreign calls are turned into
--      unsafe foreign calls (see Note [Lower safe foreign calls]). This
--      means that we will never encounter a CmmForeignCall node when running
--      sinking after stack layout
--
--   b) stack layout saves all variables live across a call on the stack
--      just before making a call (remember that we are not sinking
--      assignments to stack):
--
--       L1:
--          x = R1
--          P64[Sp - 16] = L2
--          P64[Sp - 8]  = x
--          Sp = Sp - 16
--          call f() returns L2
--       L2:
--
--      We will attempt to sink { x = R1 } but we will detect a conflict with
--      { P64[Sp - 8] = x } and hence we will drop { x = R1 } without even
--      checking whether it conflicts with { call f() }. In this way we will
--      never need to check whether any assignment conflicts with CmmCall.
--      Remember that we still need to check for potential memory conflicts.
--
-- So the result is that we only need to worry about CmmUnsafeForeignCall nodes
-- when checking conflicts (see Note [Unsafe foreign calls clobber caller-save registers]).
-- This assumption holds only when we do sinking after stack layout. If we run
-- it before stack layout we need to check for possible conflicts with all three
-- kinds of calls. Our `conflicts` function does that by using the generic
-- foldRegsDefd and foldRegsUsed functions defined in the DefinerOfRegs and
-- UserOfRegs typeclasses.
--
-- An abstraction of memory read or written.
data AbsMem
  = NoMem            -- no memory accessed
  | AnyMem           -- arbitrary memory
  | HeapMem          -- definitely heap memory
  | StackMem         -- definitely stack memory
  | SpMem            -- <size>[Sp+n]
       {-# UNPACK #-} !Int
       {-# UNPACK #-} !Int

-- Having SpMem is important because it lets us float loads from Sp
-- past stores to Sp as long as they don't overlap, and this helps to
-- unravel some long sequences of
--    x1 = [Sp + 8]
--    x2 = [Sp + 16]
--    ...
--    [Sp + 8]  = xi
--    [Sp + 16] = xj
--
-- Note that SpMem is invalidated if Sp is changed, but the definition
-- of 'conflicts' above handles that.

-- ToDo: this won't currently fix the following commonly occurring code:
--    x1 = [R1 + 8]
--    x2 = [R1 + 16]
--    ..
--    [Hp - 8]  = x1
--    [Hp - 16] = x2
--    ..
--
-- because [R1 + 8] and [Hp - 8] are both HeapMem.  We know that
-- assignments to [Hp + n] do not conflict with any other heap memory,
-- but this is tricky to nail down.  What if we had
--
--   x = Hp + n
--   [x] = ...
--
-- the store to [x] should be "new heap", not "old heap".
-- Furthermore, you could imagine that if we started inlining
-- functions in Cmm then there might well be reads of heap memory
-- that was written in the same basic block.  To take advantage of
-- non-aliasing of heap memory we will have to be more clever.

-- Note [Foreign calls clobber heap]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- It is tempting to say that foreign calls clobber only
-- non-heap/stack memory, but unfortunately we break this invariant in
-- the RTS.  For example, in stg_catch_retry_frame we call
-- stmCommitNestedTransaction() which modifies the contents of the
-- TRec it is passed (this actually caused incorrect code to be
-- generated).
--
-- Since the invariant is true for the majority of foreign calls,
-- perhaps we ought to have a special annotation for calls that can
-- modify heap/stack memory.  For now we just use the conservative
-- definition here.
--
-- Some CallishMachOps imply a memory barrier, e.g. AtomicRMW, and
-- therefore we should never float any memory operations across one of
-- these calls.

bothMems :: AbsMem -> AbsMem -> AbsMem
bothMems NoMem    x         = x
bothMems x        NoMem     = x
bothMems HeapMem  HeapMem   = HeapMem
bothMems StackMem StackMem  = StackMem
bothMems (SpMem o1 w1) (SpMem o2 w2)
  | o1 == o2  = SpMem o1 (max w1 w2)
  | otherwise = StackMem
bothMems SpMem{}  StackMem  = StackMem
bothMems StackMem SpMem{}   = StackMem
bothMems _        _         = AnyMem
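
-- For example (a sketch of the join):
--   bothMems (SpMem 8 8) (SpMem 8 4)  == SpMem 8 8  -- same slot, widest width
--   bothMems (SpMem 8 8) (SpMem 16 8) == StackMem   -- different Sp slots
--   bothMems HeapMem     StackMem     == AnyMem     -- incomparable kinds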

memConflicts :: AbsMem -> AbsMem -> Bool
memConflicts NoMem    _        = False
memConflicts _        NoMem    = False
memConflicts HeapMem  StackMem = False
memConflicts StackMem HeapMem  = False
memConflicts SpMem{}  HeapMem  = False
memConflicts HeapMem  SpMem{}  = False
memConflicts (SpMem o1 w1) (SpMem o2 w2)
  | o1 < o2   = o1 + w1 > o2
  | otherwise = o2 + w2 > o1
memConflicts _        _        = True
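
-- For example (a sketch of the interval overlap check):
--   memConflicts (SpMem 0 8)  (SpMem 8 8) == False  -- [0,8) and [8,16) are disjoint
--   memConflicts (SpMem 0 16) (SpMem 8 8) == True   -- [0,16) overlaps [8,16)
--   memConflicts HeapMem      StackMem    == False  -- heap and stack never alias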

exprMem :: DynFlags -> CmmExpr -> AbsMem
exprMem dflags (CmmLoad addr w)  = bothMems (loadAddr dflags addr (typeWidth w)) (exprMem dflags addr)
exprMem dflags (CmmMachOp _ es)  = foldr bothMems NoMem (map (exprMem dflags) es)
exprMem _      _                 = NoMem
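
-- For example (a sketch): exprMem of I64[Sp + 8] is SpMem 8 8, while a
-- load whose address is itself loaded from the stack, e.g. I64[I64[Sp]],
-- collapses to StackMem because the outer address mentions Sp.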

loadAddr :: DynFlags -> CmmExpr -> Width -> AbsMem
loadAddr dflags e w =
  case e of
   CmmReg r      -> regAddr dflags r 0 w
   CmmRegOff r i -> regAddr dflags r i w
   _other | regUsedIn dflags (CmmGlobal Sp) e -> StackMem
          | otherwise -> AnyMem

regAddr :: DynFlags -> CmmReg -> Int -> Width -> AbsMem
regAddr _      (CmmGlobal Sp) i w = SpMem i (widthInBytes w)
regAddr _      (CmmGlobal Hp) _ _ = HeapMem
regAddr _      (CmmGlobal CurrentTSO) _ _ = HeapMem -- important for PrimOps
regAddr dflags r _ _ | isGcPtrType (cmmRegType dflags r) = HeapMem -- yay! GCPtr pays for itself
regAddr _      _ _ _ = AnyMem
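
-- For example (a sketch):
--   regAddr dflags (CmmGlobal Sp) 16 W64 == SpMem 16 8
--   regAddr dflags (CmmGlobal Hp) 0  W64 == HeapMem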

{-
Note [Inline GlobalRegs?]

Should we freely inline GlobalRegs?

Actually it doesn't make a huge amount of difference either way, so we
*do* currently treat GlobalRegs as "trivial" and inline them
everywhere, but for what it's worth, here is what I discovered when I
(SimonM) looked into this:

Common sense says we should not inline GlobalRegs, because when we
have

  x = R1

the register allocator will coalesce this assignment, generating no
code, and simply record the fact that x is bound to $rbx (or
whatever).  Furthermore, if we were to sink this assignment, then the
range of code over which R1 is live increases, and the range of code
over which x is live decreases.  All things being equal, it is better
for x to be live than R1, because R1 is a fixed register whereas x can
live in any register.  So we should neither sink nor inline 'x = R1'.

However, not inlining GlobalRegs can have surprising
consequences. e.g. (cgrun020)

  c3EN:
      _s3DB::P64 = R1;
      _c3ES::P64 = _s3DB::P64 & 7;
      if (_c3ES::P64 >= 2) goto c3EU; else goto c3EV;
  c3EU:
      _s3DD::P64 = P64[_s3DB::P64 + 6];
      _s3DE::P64 = P64[_s3DB::P64 + 14];
      I64[Sp - 8] = c3F0;
      R1 = _s3DE::P64;
      P64[Sp] = _s3DD::P64;

inlining the GlobalReg gives:

  c3EN:
      if (R1 & 7 >= 2) goto c3EU; else goto c3EV;
  c3EU:
      I64[Sp - 8] = c3F0;
      _s3DD::P64 = P64[R1 + 6];
      R1 = P64[R1 + 14];
      P64[Sp] = _s3DD::P64;

but if we don't inline the GlobalReg, instead we get:

      _s3DB::P64 = R1;
      if (_s3DB::P64 & 7 >= 2) goto c3EU; else goto c3EV;
  c3EU:
      I64[Sp - 8] = c3F0;
      R1 = P64[_s3DB::P64 + 14];
      P64[Sp] = P64[_s3DB::P64 + 6];

This looks better - we managed to inline _s3DD - but in fact it
generates an extra reg-reg move:

.Lc3EU:
        movq $c3F0_info,-8(%rbp)
        movq %rbx,%rax
        movq 14(%rbx),%rbx
        movq 6(%rax),%rax
        movq %rax,(%rbp)

because _s3DB is now live across the R1 assignment, we lost the
benefit of coalescing.

Who is at fault here?  Perhaps if we knew that _s3DB was an alias for
R1, then we would not sink a reference to _s3DB past the R1
assignment.  Or perhaps we *should* do that - we might gain by sinking
it, despite losing the coalescing opportunity.

Sometimes not inlining global registers wins by virtue of the rule
about not inlining into arguments of a foreign call, e.g. (T7163) this
is what happens when we inlined F1:

  _s3L2::F32 = F1;
  _c3O3::F32 = %MO_F_Mul_W32(F1, 10.0 :: W32);
  (_s3L7::F32) = call "ccall" arg hints: [] result hints: [] rintFloat(_c3O3::F32);

but if we don't inline F1:

  (_s3L7::F32) = call "ccall" arg hints: [] result hints: []
                 rintFloat(%MO_F_Mul_W32(_s3L2::F32, 10.0 :: W32));
-}