/Data/Array/Accelerate/CUDA/Compile.hs
Haskell | 492 lines | 312 code | 70 blank | 110 comment | 11 complexity | f7cdf4cb06ebffabfda6ce1f8d7c2618 MD5 | raw file
Possible License(s): BSD-3-Clause
- {-# LANGUAGE CPP #-}
- {-# LANGUAGE GADTs #-}
- {-# LANGUAGE ScopedTypeVariables #-}
- {-# LANGUAGE TupleSections #-}
- {-# OPTIONS_GHC -fno-warn-orphans #-}
- -- |
- -- Module : Data.Array.Accelerate.CUDA.Compile
- -- Copyright : [2008..2010] Manuel M T Chakravarty, Gabriele Keller, Sean Lee
- -- [2009..2012] Manuel M T Chakravarty, Gabriele Keller, Trevor L. McDonell
- -- License : BSD3
- --
- -- Maintainer : Trevor L. McDonell <tmcdonell@cse.unsw.edu.au>
- -- Stability : experimental
- -- Portability : non-portable (GHC extensions)
- --
- module Data.Array.Accelerate.CUDA.Compile (
- -- * generate and compile kernels to realise a computation
- compileAcc, compileAfun
- ) where
- #include "accelerate.h"
- -- friends
- import Data.Array.Accelerate.Tuple
- import Data.Array.Accelerate.CUDA.AST
- import Data.Array.Accelerate.CUDA.State
- import Data.Array.Accelerate.CUDA.CodeGen
- import Data.Array.Accelerate.CUDA.Array.Sugar
- import Data.Array.Accelerate.CUDA.Analysis.Launch
- import Data.Array.Accelerate.CUDA.Foreign (canExecute)
- import Data.Array.Accelerate.CUDA.Persistent as KT
- import qualified Data.Array.Accelerate.CUDA.FullList as FL
- import qualified Data.Array.Accelerate.CUDA.Debug as D
- -- libraries
- import Numeric
- import Prelude hiding ( exp, scanl, scanr )
- import Control.Applicative hiding ( Const )
- import Control.Exception
- import Control.Monad
- import Control.Monad.Reader ( asks )
- import Control.Monad.State ( gets )
- import Control.Monad.Trans ( liftIO, MonadIO )
- import Control.Concurrent
- import Crypto.Hash.MD5 ( hashlazy )
- import Data.List ( intercalate )
- import Data.Maybe
- import Data.Monoid
- import System.Directory
- import System.Exit ( ExitCode(..) )
- import System.FilePath
- import System.IO
- import System.IO.Error
- import System.IO.Unsafe
- import System.Process
- import Text.PrettyPrint.Mainland ( ppr, renderCompact, displayLazyText )
- import qualified Data.ByteString as B
- import qualified Data.Text.Lazy as T
- import qualified Data.Text.Lazy.IO as T
- import qualified Data.Text.Lazy.Encoding as T
- import qualified Control.Concurrent.MSem as Q
- import qualified Foreign.CUDA.Driver as CUDA
- import qualified Foreign.CUDA.Analysis as CUDA
- import GHC.Conc ( getNumProcessors )
- #ifdef VERSION_unix
- import System.Posix.Process
- #else
- import System.Win32.Process
- #endif
- #ifndef SIZEOF_HSINT
- import Foreign.Storable
- #endif
- import Paths_accelerate_cuda ( getDataDir )
- -- Keep track of which kernels have been linked into which contexts. We use the
- -- context as a lookup key, which requires equality.
- --
- instance Eq CUDA.Context where
- CUDA.Context c1 == CUDA.Context c2 = c1 == c2
- -- | Initiate code generation, compilation, and data transfer for an array
- -- expression. The returned array computation is annotated so to be suitable for
- -- execution in the CUDA environment. This includes:
- --
- -- * list of array variables embedded within scalar expressions
- --
- -- * kernel object(s) required to executed the kernel
- --
- compileAcc :: Acc a -> CIO (ExecAcc a)
- compileAcc = prepareOpenAcc
- compileAfun :: Afun f -> CIO (ExecAfun f)
- compileAfun = prepareOpenAfun
- prepareOpenAfun :: OpenAfun aenv f -> CIO (PreOpenAfun ExecOpenAcc aenv f)
- prepareOpenAfun (Alam l) = Alam <$> prepareOpenAfun l
- prepareOpenAfun (Abody b) = Abody <$> prepareOpenAcc b
- prepareOpenAcc :: OpenAcc aenv a -> CIO (ExecOpenAcc aenv a)
- prepareOpenAcc rootAcc = traverseAcc rootAcc
- where
- -- Traverse an open array expression in depth-first order
- --
- -- The applicative combinators are used to gloss over that we are passing
- -- around the AST nodes together with a set of free variable indices that
- -- are merged at every step.
- --
- traverseAcc :: forall aenv arrs. OpenAcc aenv arrs -> CIO (ExecOpenAcc aenv arrs)
- traverseAcc acc@(OpenAcc pacc) =
- case pacc of
- -- Environment and control flow
- Avar ix -> node $ pure (Avar ix)
- Alet a b -> node . pure =<< Alet <$> traverseAcc a <*> traverseAcc b
- Apply f a -> node . pure =<< Apply <$> compileAfun f <*> traverseAcc a
- Acond p t e -> node =<< liftA3 Acond <$> travE p <*> travA t <*> travA e
- Atuple tup -> node =<< liftA Atuple <$> travAtup tup
- Aprj ix tup -> node =<< liftA (Aprj ix) <$> travA tup
- -- Array injection
- Unit e -> node =<< liftA Unit <$> travE e
- Use arrs -> use (arrays (undefined::arrs)) arrs >> node (pure $ Use arrs)
- -- Index space transforms
- Reshape s a -> node =<< liftA2 Reshape <$> travE s <*> travA a
- Replicate slix e a -> exec =<< liftA2 (Replicate slix) <$> travE e <*> travA a
- Slice slix a e -> exec =<< liftA2 (Slice slix) <$> travA a <*> travE e
- Backpermute e f a -> exec =<< liftA3 Backpermute <$> travE e <*> travF f <*> travD a
- -- Producers
- Generate e f -> exec =<< liftA2 Generate <$> travE e <*> travF f
- Map f a -> exec =<< liftA2 Map <$> travF f <*> travD a
- ZipWith f a b -> exec =<< liftA3 ZipWith <$> travF f <*> travD a <*> travD b
- Transform e p f a -> exec =<< liftA4 Transform <$> travE e <*> travF p <*> travF f <*> travD a
- -- Consumers
- Fold f z a -> exec =<< liftA3 Fold <$> travF f <*> travE z <*> travD a
- Fold1 f a -> exec =<< liftA2 Fold1 <$> travF f <*> travD a
- FoldSeg f e a s -> exec =<< liftA4 FoldSeg <$> travF f <*> travE e <*> travD a <*> travD s
- Fold1Seg f a s -> exec =<< liftA3 Fold1Seg <$> travF f <*> travD a <*> travD s
- Scanl f e a -> exec =<< liftA3 Scanl <$> travF f <*> travE e <*> travD a
- Scanl' f e a -> exec =<< liftA3 Scanl' <$> travF f <*> travE e <*> travD a
- Scanl1 f a -> exec =<< liftA2 Scanl1 <$> travF f <*> travD a
- Scanr f e a -> exec =<< liftA3 Scanr <$> travF f <*> travE e <*> travD a
- Scanr' f e a -> exec =<< liftA3 Scanr' <$> travF f <*> travE e <*> travD a
- Scanr1 f a -> exec =<< liftA2 Scanr1 <$> travF f <*> travD a
- Permute f d g a -> exec =<< liftA4 Permute <$> travF f <*> travA d <*> travF g <*> travD a
- Stencil f b a -> exec =<< liftA2 (flip Stencil b) <$> travF f <*> travA a
- Stencil2 f b1 a1 b2 a2 -> exec =<< liftA3 stencil2 <$> travF f <*> travA a1 <*> travA a2
- where stencil2 f' a1' a2' = Stencil2 f' b1 a1' b2 a2'
- Foreign ff afun a -> case canExecute ff of
- -- If it's a foreign call for the CUDA backend don't bother compiling the pure version
- (Just _) -> node =<< liftA (Foreign ff foreignError) <$> travA a
- Nothing -> node . pure =<< Foreign ff <$> compileAfun afun <*> traverseAcc a
- where
- use :: ArraysR a -> a -> CIO ()
- use ArraysRunit () = return ()
- use ArraysRarray arr = useArray arr
- use (ArraysRpair r1 r2) (a1, a2) = use r1 a1 >> use r2 a2
- exec :: (Gamma aenv, PreOpenAcc ExecOpenAcc aenv arrs) -> CIO (ExecOpenAcc aenv arrs)
- exec (aenv, eacc) = do
- kernel <- build acc aenv
- return $! ExecAcc (fullOfList kernel) aenv eacc
- node :: (Gamma aenv', PreOpenAcc ExecOpenAcc aenv' arrs') -> CIO (ExecOpenAcc aenv' arrs')
- node = fmap snd . wrap
- wrap :: (Gamma aenv', PreOpenAcc ExecOpenAcc aenv' arrs') -> CIO (Gamma aenv', ExecOpenAcc aenv' arrs')
- wrap = return . liftA (ExecAcc noKernel mempty)
- travA :: OpenAcc aenv' a' -> CIO (Gamma aenv', ExecOpenAcc aenv' a')
- travA a = pure <$> traverseAcc a
- travD :: (Shape sh, Elt e) => OpenAcc aenv (Array sh e) -> CIO (Gamma aenv, ExecOpenAcc aenv (Array sh e))
- travD (OpenAcc delayed) =
- case delayed of
- Avar ix -> wrap (freevar ix, Avar ix)
- Map f a -> wrap =<< liftA2 Map <$> travF f <*> travD a
- Generate e f -> wrap =<< liftA2 Generate <$> travE e <*> travF f
- Backpermute e f a -> wrap =<< liftA3 Backpermute <$> travE e <*> travF f <*> travD a
- Transform e p f a -> wrap =<< liftA4 Transform <$> travE e <*> travF p <*> travF f <*> travD a
- _ -> INTERNAL_ERROR(error) "compile" "expected fused/delayable array"
- travAtup :: Atuple (OpenAcc aenv) a -> CIO (Gamma aenv, Atuple (ExecOpenAcc aenv) a)
- travAtup NilAtup = return (pure NilAtup)
- travAtup (SnocAtup t a) = liftA2 SnocAtup <$> travAtup t <*> travA a
- travF :: OpenFun env aenv t -> CIO (Gamma aenv, PreOpenFun ExecOpenAcc env aenv t)
- travF (Body b) = liftA Body <$> travE b
- travF (Lam f) = liftA Lam <$> travF f
- noKernel :: FL.FullList () (AccKernel a)
- noKernel = FL.FL () (INTERNAL_ERROR(error) "compile" "no kernel module for this node") FL.Nil
- fullOfList :: [a] -> FL.FullList () a
- fullOfList [] = INTERNAL_ERROR(error) "fullList" "empty list"
- fullOfList [x] = FL.singleton () x
- fullOfList (x:xs) = FL.cons () x (fullOfList xs)
- foreignError = INTERNAL_ERROR(error) "compile" $ "Didn't compile the pure version of a foreign function call but" ++
- " it looks like it's being executed anyway"
- -- Traverse a scalar expression
- --
- travE :: OpenExp env aenv e
- -> CIO (Gamma aenv, PreOpenExp ExecOpenAcc env aenv e)
- travE exp =
- case exp of
- Var ix -> return $ pure (Var ix)
- Const c -> return $ pure (Const c)
- PrimConst c -> return $ pure (PrimConst c)
- IndexAny -> return $ pure IndexAny
- IndexNil -> return $ pure IndexNil
- --
- Let a b -> liftA2 Let <$> travE a <*> travE b
- IndexCons t h -> liftA2 IndexCons <$> travE t <*> travE h
- IndexHead h -> liftA IndexHead <$> travE h
- IndexTail t -> liftA IndexTail <$> travE t
- IndexSlice slix x s -> liftA2 (IndexSlice slix) <$> travE x <*> travE s
- IndexFull slix x s -> liftA2 (IndexFull slix) <$> travE x <*> travE s
- ToIndex s i -> liftA2 ToIndex <$> travE s <*> travE i
- FromIndex s i -> liftA2 FromIndex <$> travE s <*> travE i
- Tuple t -> liftA Tuple <$> travT t
- Prj ix e -> liftA (Prj ix) <$> travE e
- Cond p t e -> liftA3 Cond <$> travE p <*> travE t <*> travE e
- Iterate n f x -> liftA3 Iterate <$> travE n <*> travE f <*> travE x
- -- While p f x -> liftA3 While <$> travE p <*> travE f <*> travE x
- PrimApp f e -> liftA (PrimApp f) <$> travE e
- Index a e -> liftA2 Index <$> travA a <*> travE e
- LinearIndex a e -> liftA2 LinearIndex <$> travA a <*> travE e
- Shape a -> liftA Shape <$> travA a
- ShapeSize e -> liftA ShapeSize <$> travE e
- Intersect x y -> liftA2 Intersect <$> travE x <*> travE y
- where
- travA :: (Shape sh, Elt e)
- => OpenAcc aenv (Array sh e) -> CIO (Gamma aenv, ExecOpenAcc aenv (Array sh e))
- travA a = do
- a' <- traverseAcc a
- return $ (bind a', a')
- travT :: Tuple (OpenExp env aenv) t
- -> CIO (Gamma aenv, Tuple (PreOpenExp ExecOpenAcc env aenv) t)
- travT NilTup = return (pure NilTup)
- travT (SnocTup t e) = liftA2 SnocTup <$> travT t <*> travE e
- bind :: (Shape sh, Elt e) => ExecOpenAcc aenv (Array sh e) -> Gamma aenv
- bind (ExecAcc _ _ (Avar ix)) = freevar ix
- bind _ = INTERNAL_ERROR(error) "bind" "expected array variable"
- -- Applicative
- -- -----------
- --
- liftA4 :: Applicative f => (a -> b -> c -> d -> e) -> f a -> f b -> f c -> f d -> f e
- liftA4 f a b c d = f <$> a <*> b <*> c <*> d
- -- Compilation
- -- -----------
- -- Generate, compile, and link code to evaluate an array computation. We use
- -- 'unsafePerformIO' here to leverage laziness, so that the 'link' function
- -- evaluates and blocks on the external compiler only once the compiled object
- -- is truly needed.
- --
- build :: OpenAcc aenv a -> Gamma aenv -> CIO [AccKernel a]
- build acc aenv = do
- dev <- asks deviceProps
- mapM (build1 acc) (codegenAcc dev acc aenv)
- build1 :: OpenAcc aenv a -> CUTranslSkel aenv a -> CIO (AccKernel a)
- build1 acc code = do
- dev <- asks deviceProps
- table <- gets kernelTable
- (entry,key) <- compile table dev code
- let (cta,blocks,smem) = launchConfig acc dev occ
- (mdl,fun,occ) = unsafePerformIO $ do
- m <- link table key
- f <- CUDA.getFun m entry
- l <- CUDA.requires f CUDA.MaxKernelThreadsPerBlock
- o <- determineOccupancy acc dev f l
- D.when D.dump_cc (stats entry f o)
- return (m,f,o)
- --
- return $ AccKernel entry fun mdl occ cta smem blocks
- where
- stats name fn occ = do
- regs <- CUDA.requires fn CUDA.NumRegs
- smem <- CUDA.requires fn CUDA.SharedSizeBytes
- cmem <- CUDA.requires fn CUDA.ConstSizeBytes
- lmem <- CUDA.requires fn CUDA.LocalSizeBytes
- let msg1 = "entry function '" ++ name ++ "' used "
- ++ shows regs " registers, " ++ shows smem " bytes smem, "
- ++ shows lmem " bytes lmem, " ++ shows cmem " bytes cmem"
- msg2 = "multiprocessor occupancy " ++ showFFloat (Just 1) (CUDA.occupancy100 occ) "% : "
- ++ shows (CUDA.activeThreads occ) " threads over "
- ++ shows (CUDA.activeWarps occ) " warps in "
- ++ shows (CUDA.activeThreadBlocks occ) " blocks"
- --
- -- make sure kernel/stats are printed together. Use 'intercalate' rather
- -- than 'unlines' to avoid a trailing newline.
- --
- message $ intercalate "\n ... " [msg1, msg2]
- -- Link a compiled binary and update the associated kernel entry in the hash
- -- table. This may entail waiting for the external compilation process to
- -- complete. If successful, the temporary files are removed.
- --
- link :: KernelTable -> KernelKey -> IO CUDA.Module
- link table key =
- let intErr = INTERNAL_ERROR(error) "link" "missing kernel entry"
- in do
- ctx <- CUDA.get
- entry <- fromMaybe intErr `fmap` KT.lookup table key
- case entry of
- CompileProcess cufile done -> do
- -- Wait for the compiler to finish and load the binary object into the
- -- current context.
- --
- -- A forked thread will fill the MVar once the external compilation
- -- process completes, but only the main thread executes kernels. Hence,
- -- only one thread will ever attempt to take the MVar in order to link
- -- the binary object.
- --
- message "waiting for nvcc..."
- takeMVar done
- let cubin = replaceExtension cufile ".cubin"
- bin <- B.readFile cubin
- mdl <- CUDA.loadData bin
- -- Update hash tables and stash the binary object into the persistent
- -- cache
- --
- KT.insert table key $! KernelObject bin (FL.singleton ctx mdl)
- KT.persist cubin key
- -- Remove temporary build products.
- -- If compiling kernels with debugging symbols, leave the source files
- -- in place so that they can be referenced by 'cuda-gdb'.
- --
- D.unless D.debug_cc $ do
- removeFile cufile
- removeDirectory (dropFileName cufile)
- `catchIOError` \_ -> return () -- directory not empty
- return mdl
- -- If we get a real object back, then this will already be in the
- -- persistent cache, since either it was just read in from there, or we
- -- had to generate new code and the link step above has added it.
- --
- KernelObject bin active
- | Just mdl <- FL.lookup ctx active -> return mdl
- | otherwise -> do
- message "re-linking module for current context"
- mdl <- CUDA.loadData bin
- KT.insert table key $! KernelObject bin (FL.cons ctx mdl active)
- return mdl
- -- Generate and compile code for a single open array expression
- --
- compile :: KernelTable -> CUDA.DeviceProperties -> CUTranslSkel aenv a -> CIO (String, KernelKey)
- compile table dev cunit = do
- exists <- isJust `fmap` liftIO (KT.lookup table key)
- unless exists $ do
- message $ unlines [ show key, T.unpack code ]
- nvcc <- fromMaybe (error "nvcc: command not found") <$> liftIO (findExecutable "nvcc")
- (file,hdl) <- openTemporaryFile "dragon.cu" -- rawr!
- flags <- compileFlags file
- done <- liftIO $ do
- message $ "execute: " ++ nvcc ++ " " ++ unwords flags
- T.hPutStr hdl code `finally` hClose hdl
- enqueueProcess (proc nvcc flags) `onException` removeFile file
- --
- liftIO $ KT.insert table key (CompileProcess file done)
- --
- return (entry, key)
- where
- entry = show cunit
- key = (CUDA.computeCapability dev, hashlazy (T.encodeUtf8 code) )
- code = displayLazyText . renderCompact $ ppr cunit
- -- Determine the appropriate command line flags to pass to the compiler process.
- -- This is dependent on the host architecture and device capabilities.
- --
- compileFlags :: FilePath -> CIO [String]
- compileFlags cufile = do
- CUDA.Compute m n <- CUDA.computeCapability `fmap` asks deviceProps
- ddir <- liftIO getDataDir
- return $ filter (not . null) $
- [ "-I", ddir </> "cubits"
- , "-arch=sm_" ++ show m ++ show n
- , "-cubin"
- , "-o", cufile `replaceExtension` "cubin"
- , if D.mode D.dump_cc then "" else "--disable-warnings"
- , if D.mode D.debug_cc then "-G" else "-O3"
- , machine
- , cufile ]
- where
- #if SIZEOF_HSINT == 4
- machine = "-m32"
- #elif SIZEOF_HSINT == 8
- machine = "-m64"
- #else
- machine = case sizeOf (undefined :: Int) of
- 4 -> "-m32"
- 8 -> "-m64"
- #endif
- -- Open a unique file in the temporary directory used for compilation
- -- by-products. The directory will be created if it does not exist.
- --
- openTemporaryFile :: String -> CIO (FilePath, Handle)
- openTemporaryFile template = liftIO $ do
- pid <- getProcessID
- dir <- (</>) <$> getTemporaryDirectory <*> pure ("accelerate-cuda-" ++ show pid)
- createDirectoryIfMissing True dir
- openTempFile dir template
- #ifndef VERSION_unix
- getProcessID :: ProcessHandle -> IO ProcessId
- getProcessID = getProcessId
- #endif
- -- Worker pool
- -- -----------
- {-# NOINLINE pool #-}
- pool :: Q.MSem Int
- pool = unsafePerformIO $ Q.new =<< getNumProcessors
- -- Queue a system process to be executed and return an MVar flag that will be
- -- filled once the process completes. The task will only be launched once there
- -- is a worker available from the pool. This ensures we don't run out of process
- -- handles or flood the IO bus, degrading performance.
- --
- enqueueProcess :: CreateProcess -> IO (MVar ())
- enqueueProcess cp = do
- mvar <- newEmptyMVar
- _ <- forkIO $ do
- -- wait for a worker to become available
- Q.wait pool
- (_,_,_,pid) <- createProcess cp
- -- asynchronously notify the queue when the compiler has completed
- _ <- forkIO $ do finally (waitFor pid) (Q.signal pool)
- putMVar mvar () -- never executed if the compilation fails.
- return ()
- --
- return mvar
- -- Wait for a (compilation) process to finish
- --
- waitFor :: ProcessHandle -> IO ()
- waitFor pid = do
- status <- waitForProcess pid
- case status of
- ExitSuccess -> return ()
- ExitFailure c -> error $ "nvcc terminated abnormally (" ++ show c ++ ")"
- -- Debug
- -- -----
- {-# INLINE message #-}
- message :: MonadIO m => String -> m ()
- message msg = trace msg $ return ()
- {-# INLINE trace #-}
- trace :: MonadIO m => String -> m a -> m a
- trace msg next = D.message D.dump_cc ("cc: " ++ msg) >> next