/src/Control/Evo/Tests/Regex.hs
Haskell | 343 lines | 290 code | 42 blank | 11 comment | 26 complexity | ef4cfe1ae8bec078f1c5a497615129e4 MD5 | raw file
- {-# LANGUAGE LambdaCase, FlexibleContexts, OverloadedStrings, GADTs, TypeFamilies #-}
- {-# LANGUAGE RankNTypes, DataKinds, PartialTypeSignatures, DeriveGeneric, TypeOperators #-}
- module Control.Evo.Tests.Regex where
- import Interlude hiding (show, State)
- import Prelude (show, String)
- import qualified Data.Text as Text
- import System.Timeout
- import Control.Evo
- import Control.Evo.Distributed (distributedSpeciesEvolution, distributedSpeciesEvolutionClient)
- import Control.Effects.Reader
- import qualified Data.Map.Strict as Map
- import GHC.IO.Unsafe
- import Data.IORef
- import Text.Regex.PCRE.Heavy
- import Control.Effects.State
- -- import Control.Lens
- data RegEx =
- Failed
- | Empty
- | AnyChar
- | Literal Char
- | Concat RegEx RegEx
- | Alternate RegEx RegEx
- | Kleene RegEx
- | Optional RegEx
- | Range Char Char
- deriving (Eq, Ord, Read, Show, Generic)
- memo :: (Ord a, Show a) => IORef (Map a b) -> (a -> b) -> a -> b
- memo mv f a =unsafePerformIO $ do
- mp <- readIORef mv
- case Map.lookup a mp of
- Nothing -> do
- let b = f a
- writeIORef mv $! Map.insert a b mp
- return b
- Just b ->
- return b
- showRegex :: RegEx -> String
- showRegex Failed = "∅"
- showRegex Empty = "ε"
- showRegex AnyChar = "."
- showRegex (Literal c)
- | c `elem` (".[]()*{}\\+?|-^$=!&%/'\",;:_#<>" :: String) = '\\' : [c]
- | otherwise = [c]
- showRegex (Concat r1 r2) = showRegex r1 ++ showRegex r2
- showRegex (Alternate r1 r2) = "(" ++ showRegex r1 ++ "|" ++ showRegex r2 ++ ")"
- showRegex (Kleene (Literal c)) = showRegex (Literal c) ++ "*"
- showRegex (Kleene r) = "(" ++ showRegex r ++ ")*"
- showRegex (Optional r) = "(" ++ showRegex r ++ ")?"
- showRegex (Range c1 c2) = "[" ++ showRegex (Literal c1) ++ "-" ++ showRegex (Literal c2) ++ "]"
- instance Semigroup RegEx where (<>) = mappend
- instance Monoid RegEx where
- mempty = Empty
- mappend = Concat
- exactly :: String -> RegEx
- exactly = mconcat . map Literal
- inc :: MonadState Int m => m ()
- inc = modify' (+ 1)
- match :: RegEx -> String -> [String]
- match = memoMatch
- where
- memoMatch = match' -- curry (memo ref (uncurry match'))
- match' :: RegEx -> String -> [String]
- match' Failed _ = []
- match' Empty s = [s]
- match' AnyChar (_ : ss) = [ss]
- match' AnyChar [] = []
- match' (Literal c) (s : ss) | c == s = [ss]
- | otherwise = []
- match' (Literal _) "" = []
- match' (Concat r1 r2) s = [s'' | s' <- memoMatch r1 s, s'' <- memoMatch r2 s']
- match' (Alternate r1 r2) s = memoMatch r1 s ++ memoMatch r2 s
- match' (Kleene r) s =
- ordNub $ s : [s' | x <- memoMatch r s, x /= s, s' <- memoMatch (Kleene r) x]
- match' (Optional r) s = s : memoMatch r s
- match' (Range c1 c2) (c : ss) = [ss | c1 <= c && c2 >= c]
- match' (Range _ _) [] = []
- isMatch :: RegEx -> String -> Bool
- isMatch r s =
- any null $ match r s
- alphaNums :: String
- -- alphaNums = ['a'..'z'] <> ['A'..'Z'] <> ['0'..'9'] <> "\"#$%&/()=?*+'~-.,;:_"
- alphaNums = map chr [32..127]
- randomRegex :: MonadRandom m => Int -> m RegEx
- randomRegex 0 = getRandomR (1 :: Int, 10) >>= \case
- 1 -> return AnyChar
- 2 -> do
- x <- uniform alphaNums
- y <- uniform alphaNums
- return $ Range (min x y) (max x y)
- -- 3 -> return Empty
- -- 4 -> return Failed
- _ -> Literal <$> uniform alphaNums
- randomRegex n = getRandomR (1 :: Int, 4) >>= \case
- 1 -> Concat <$> randomRegex left <*> randomRegex right
- 2 -> Alternate <$> randomRegex left <*> randomRegex right
- 3 -> Kleene <$> randomRegex (n - 1)
- 4 -> Optional <$> randomRegex (n - 1)
- _ -> error "Can't happen"
- where left = (n - 1) `div` 2
- right = (n - 1) - left
- weighRegex :: RegEx -> TreeSize
- weighRegex Failed = TreeSize 1 []
- weighRegex Empty = TreeSize 1 []
- weighRegex AnyChar = TreeSize 1 []
- weighRegex (Literal _) = TreeSize 1 []
- weighRegex (Range _ _) = TreeSize 1 []
- weighRegex (Concat l r) = let tl@(TreeSize sl _) = weighRegex l
- tr@(TreeSize sr _) = weighRegex r
- in TreeSize (sl + sr + 1) [tl, tr]
- weighRegex (Alternate l r) = let tl@(TreeSize sl _) = weighRegex l
- tr@(TreeSize sr _) = weighRegex r
- in TreeSize (sl + sr + 1) [tl, tr]
- weighRegex (Kleene l) = let tl@(TreeSize sl _) = weighRegex l
- in TreeSize (sl + 1) [tl]
- weighRegex (Optional l) = let tl@(TreeSize sl _) = weighRegex l
- in TreeSize (sl + 1) [tl]
- data TreeSize = TreeSize Int [TreeSize] deriving (Eq, Ord, Read, Show)
- randomSubtree :: MonadRandom m => RegEx -> m (RegEx, RegEx -> RegEx)
- randomSubtree regex = withWeight regex (weighRegex regex)
- where withWeight Failed _ = return (Failed, identity)
- withWeight Empty _ = return (Empty, identity)
- withWeight AnyChar _ = return (AnyChar, identity)
- withWeight (Literal c) _ = return (Literal c, identity)
- withWeight (Range c1 c2) _ = return (Range c1 c2, identity)
- withWeight (Concat l r) (TreeSize cs [lts@(TreeSize ls _), rts@(TreeSize _ _)]) = do
- choice <- getRandomR (1, cs)
- if choice == 1 then return (Concat l r, identity)
- else if choice - 1 < ls then do
- (lsub, lfun) <- withWeight l lts
- return (lsub, \x -> Concat (lfun x) r)
- else do
- (rsub, rfun) <- withWeight r rts
- return (rsub, Concat l . rfun)
- withWeight (Alternate l r) (TreeSize cs [lts@(TreeSize ls _), rts@(TreeSize _ _)]) = do
- choice <- getRandomR (1, cs)
- if choice == 1 then return (Alternate l r, identity)
- else if choice - 1 < ls then do
- (lsub, lfun) <- withWeight l lts
- return (lsub, \x -> Alternate (lfun x) r)
- else do
- (rsub, rfun) <- withWeight r rts
- return (rsub, Alternate l . rfun)
- withWeight (Kleene s) (TreeSize cs [sts@(TreeSize _ _)]) = do
- choice <- getRandomR (1, cs)
- if choice == 1 then return (Kleene s, identity)
- else do
- (sub, fun) <- withWeight s sts
- return (sub, Kleene . fun)
- withWeight (Optional s) (TreeSize cs [sts@(TreeSize _ _)]) = do
- choice <- getRandomR (1, cs)
- if choice == 1 then return (Optional s, identity)
- else do
- (sub, fun) <- withWeight s sts
- return (sub, Optional . fun)
- withWeight l r = error $ "Bad pattern: " <> show l <> " " <> show r
- isLeaf :: RegEx -> Bool
- isLeaf r | TreeSize 1 _ <- weighRegex r = True
- isLeaf _ = False
- -- | A random leaf subexpression
- randomLeaf :: MonadRandom m => RegEx -> m (RegEx, RegEx -> RegEx)
- randomLeaf regex = do
- (sub, hole) <- randomSubtree regex
- if isLeaf sub then return (sub, hole)
- else do
- (leaf, holeInSub) <- randomLeaf sub
- return (leaf, hole . holeInSub)
- -- | Only teak leaf regexes (ones that don't have subexpressions)
- tweakLeaf :: MonadRandom m => RegEx -> m RegEx
- tweakLeaf Failed = randomRegex 0
- tweakLeaf Empty = randomRegex 0
- tweakLeaf AnyChar = randomRegex 0
- tweakLeaf (Literal _) = randomRegex 0
- tweakLeaf (Range _ _) = randomRegex 0
- tweakLeaf _ = error "tweakLeaf only takes leaf regexes"
- tweakRegex :: MonadRandom m => RegEx -> m RegEx
- tweakRegex regex = do
- (leaf, hole) <- randomLeaf regex
- hole <$> tweakLeaf leaf
- mutateRegex :: MonadRandom m => RegEx -> m RegEx
- mutateRegex regex = do
- choice <- getRandomR (1 :: Int, 10)
- if choice == 1 then
- crossRegex regex regex
- else if choice == 2 then
- tweakRegex regex
- else do
- (sub, fun) <- randomSubtree regex
- let TreeSize s _ = weighRegex sub
- fun <$> (randomRegex =<< getRandomR (max (s - 2) 0, s + 2))
- newRandomRegex :: MonadRandom m => m RegEx
- newRandomRegex = getRandomR (0, 30) >>= randomRegex
- crossRegex :: MonadRandom m => RegEx -> RegEx -> m RegEx
- crossRegex a b = do
- (sub, _) <- randomSubtree a
- (_, fun) <- randomSubtree b
- return (fun sub)
- validEmails :: [String]
- validEmails = ["email@example.com", "firstname.lastname@example.com", "email@subdomain.example.com", "firstname+lastname@example.com", "email@123.123.123.123", "email@[123.123.123.123]", "“email”@example.com", "1234567890@example.com", "email@example-one.com", "_______@example.com", "email@example.name", "email@example.museum", "email@example.co.jp", "firstname-lastname@example.com"]
- invalidEmails :: [String]
- invalidEmails = ["plainaddress", "#@%^%#$@#$@#.com", "@example.com", "Joe Smith <email@example.com>", "email.example.com", "email@example@example.com", ".email@example.com", "email.@example.com", "email..email@example.com", "あいうえお@example.com", "email@example.com (Joe Smith)", "email@example", "email@-example.com", "email@example.web", "email@111.222.333.44444", "email@example..com", "Abc..123@example.com"]
- rangeSum :: RegEx -> Int
- rangeSum Failed = 0
- rangeSum Empty = 0
- rangeSum AnyChar = 0
- rangeSum (Literal _) = 0
- rangeSum (Range a b) = ord b - ord a
- rangeSum (Concat l r) = rangeSum l + rangeSum r
- rangeSum (Alternate l r) = rangeSum l + rangeSum r
- rangeSum (Kleene l) = rangeSum l
- rangeSum (Optional l) = rangeSum l
- evaluateRegex :: (MonadRandom m, MonadIO m) => [String] -> [String] -> RegEx -> m (Int, Int, Int)
- evaluateRegex matchThese don'tMatchThese rgx = do
- let rgx' = "^" <> showRegex rgx <> "$"
- cmp = case compileM (toS rgx') [] of
- Right cmp' -> cmp'
- Left err -> error $ rgx' <> " ==> " <> show rgx <> " ==> " <> err
- cmp `seq` return ()
- maybeScore <- liftIO $ timeout 1000 $ do
- let posMatch = length $ filter (=~ cmp) matchThese
- negMatch = length $ filter (=~ cmp) don'tMatchThese
- validity <- evaluate $ posMatch - negMatch
- sizePenalty <- evaluate $ let TreeSize s _ = weighRegex rgx in -(max s 10)
- rangePenalty <- evaluate $ rangeSum rgx
- void $ evaluate sizePenalty
- -- let score = validity `seq` sizePenalty `seq`
- return (validity + min 0 (sizePenalty + 100), sizePenalty, -rangePenalty)
- case maybeScore of
- Nothing ->
- -- putStrLn $ "Timed out on " <> showRegex rgx
- return (-100, 0, 0)
- Just s -> return s
- printActivity :: MonadIO m => EvolutionActivity RegEx (Int, Int, Int) -> m ()
- printActivity (NewGenerationActivity gen) = do
- let (reg, score) = selectBest gen
- putStr (pshow score <> ": " <> toS (showRegex reg) <> "\n" :: Text)
- printActivity EnvironmentMutation = putText "Env mutation"
- printActivity _ = return ()
- loadWords :: FilePath -> IO [String]
- loadWords path = map toS . Text.lines <$> readFile path
- mutateWord :: MonadRandom m => Double -> String -> m String
- mutateWord strength word = do
- keeps <- replicateM (length word) ((>= strength) <$> getRandomR (0, 1))
- return $ map snd $ filter fst $ zip keeps word
- evoHandler :: _ => [String] -> [String] -> Evolution RegEx (Int, Int, Int) m
- evoHandler englishWords germanWords = EvolutionMethods newRandomRegex mutateRegex crossRegex eval mutateEnv
- where
- mutateEnv = do
- strength <- getRandomR (0, 0.5)
- eng <- mapM (mutateWord strength) englishWords
- ger <- mapM (mutateWord strength) germanWords
- setState (eng, ger)
- eval u = do
- (eng, ger) <- getState
- evaluateRegex eng ger u
- testWith ::
- RuntimeImplemented (Evolution RegEx (Int, Int, Int))
- (RuntimeImplemented (Log (EvolutionActivity RegEx (Int, Int, Int)))
- (RuntimeImplemented (ReadEnv GeneticConfiguration)
- (StateT ([String], [String])
- IO))) b
- -> IO b
- testWith evo = do
- english <- readFile "english.txt"
- let englishWords = map toS $ Text.lines english
- german <- readFile "german.txt"
- let germanWords = map toS $ Text.lines german
- evo
- & handleEvolution
- (evoHandler englishWords germanWords)
- (LogMethods printActivity)
- & implementReadEnv (return (GeneticConfiguration 10 20 30))
- & implementStateViaStateT (englishWords, germanWords)
- test :: IO ()
- test = testWith speciesEvolution
- testIterative :: IO ()
- testIterative = testWith iterativeEvolution
- testDistributed :: IO ()
- testDistributed = testWith distributedSpeciesEvolutionClient
- testDistributedServer :: IO ()
- testDistributedServer = distributedSpeciesEvolution (Proxy :: Proxy (RegEx, (Int, Int, Int)))
- oneOf :: String -> RegEx
- oneOf [c] = Literal c
- oneOf (c : cs) = Alternate (Literal c) (oneOf cs)
- oneOrMore :: RegEx -> RegEx
- oneOrMore r = Concat r (Kleene r)
- az09 :: RegEx
- az09 = Range 'a' 'z' `Alternate` Range '0' '9'
- -- ^[a-z0-9!#$%&'*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+\/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?$
- charset :: RegEx
- charset = az09
- & Alternate (oneOf "!#$%&'*+\\/=?^_`{|}~-")
- emailRgx :: RegEx
- emailRgx =
- oneOrMore charset `Concat` Kleene (Literal '.' `Concat` oneOrMore charset)
- `Concat` (Literal '@')
- `Concat` oneOrMore (az09 `Concat` Optional (Kleene (az09 `Alternate` Literal '-') `Concat` az09) `Concat` Literal '.')
- `Concat` az09
- `Concat` Optional (Kleene (az09 `Alternate` Literal '-') `Concat` az09)
- bestOrganic :: RegEx
- bestOrganic = Concat (Concat (Optional (Concat (Literal '"') (Kleene AnyChar))) (Concat (Optional (Concat (Optional (Range 'P' 'p')) (Concat (Range 'P' 'p') (Concat AnyChar (Concat AnyChar (Optional (Concat (Concat AnyChar (Concat AnyChar (Range 'S' 'd'))) (Concat (Range '.' 'p') (Concat AnyChar (Concat AnyChar (Concat AnyChar (Concat (Kleene (Concat AnyChar AnyChar)) (Concat AnyChar (Concat AnyChar (Optional (Range 'P' 'p')))))))))))))))) (Concat (Alternate (Literal '1') (Range 'S' 'd')) (Kleene AnyChar)))) (Concat (Alternate (Literal '1') (Range 'P' 'p')) (Optional (Concat AnyChar (Range '.' 'p'))))