PageRenderTime 3883ms CodeModel.GetById 34ms RepoModel.GetById 8ms app.codeStats 1ms

/src/Control/Evo/Tests/Regex.hs

https://gitlab.com/LukaHorvat/evo
Haskell | 343 lines | 290 code | 42 blank | 11 comment | 26 complexity | ef4cfe1ae8bec078f1c5a497615129e4 MD5 | raw file
  1. {-# LANGUAGE LambdaCase, FlexibleContexts, OverloadedStrings, GADTs, TypeFamilies #-}
  2. {-# LANGUAGE RankNTypes, DataKinds, PartialTypeSignatures, DeriveGeneric, TypeOperators #-}
  3. module Control.Evo.Tests.Regex where
  4. import Interlude hiding (show, State)
  5. import Prelude (show, String)
  6. import qualified Data.Text as Text
  7. import System.Timeout
  8. import Control.Evo
  9. import Control.Evo.Distributed (distributedSpeciesEvolution, distributedSpeciesEvolutionClient)
  10. import Control.Effects.Reader
  11. import qualified Data.Map.Strict as Map
  12. import GHC.IO.Unsafe
  13. import Data.IORef
  14. import Text.Regex.PCRE.Heavy
  15. import Control.Effects.State
  16. -- import Control.Lens
  17. data RegEx =
  18. Failed
  19. | Empty
  20. | AnyChar
  21. | Literal Char
  22. | Concat RegEx RegEx
  23. | Alternate RegEx RegEx
  24. | Kleene RegEx
  25. | Optional RegEx
  26. | Range Char Char
  27. deriving (Eq, Ord, Read, Show, Generic)
  28. memo :: (Ord a, Show a) => IORef (Map a b) -> (a -> b) -> a -> b
  29. memo mv f a =unsafePerformIO $ do
  30. mp <- readIORef mv
  31. case Map.lookup a mp of
  32. Nothing -> do
  33. let b = f a
  34. writeIORef mv $! Map.insert a b mp
  35. return b
  36. Just b ->
  37. return b
  38. showRegex :: RegEx -> String
  39. showRegex Failed = "∅"
  40. showRegex Empty = "ε"
  41. showRegex AnyChar = "."
  42. showRegex (Literal c)
  43. | c `elem` (".[]()*{}\\+?|-^$=!&%/'\",;:_#<>" :: String) = '\\' : [c]
  44. | otherwise = [c]
  45. showRegex (Concat r1 r2) = showRegex r1 ++ showRegex r2
  46. showRegex (Alternate r1 r2) = "(" ++ showRegex r1 ++ "|" ++ showRegex r2 ++ ")"
  47. showRegex (Kleene (Literal c)) = showRegex (Literal c) ++ "*"
  48. showRegex (Kleene r) = "(" ++ showRegex r ++ ")*"
  49. showRegex (Optional r) = "(" ++ showRegex r ++ ")?"
  50. showRegex (Range c1 c2) = "[" ++ showRegex (Literal c1) ++ "-" ++ showRegex (Literal c2) ++ "]"
  51. instance Semigroup RegEx where (<>) = mappend
  52. instance Monoid RegEx where
  53. mempty = Empty
  54. mappend = Concat
  55. exactly :: String -> RegEx
  56. exactly = mconcat . map Literal
  57. inc :: MonadState Int m => m ()
  58. inc = modify' (+ 1)
  59. match :: RegEx -> String -> [String]
  60. match = memoMatch
  61. where
  62. memoMatch = match' -- curry (memo ref (uncurry match'))
  63. match' :: RegEx -> String -> [String]
  64. match' Failed _ = []
  65. match' Empty s = [s]
  66. match' AnyChar (_ : ss) = [ss]
  67. match' AnyChar [] = []
  68. match' (Literal c) (s : ss) | c == s = [ss]
  69. | otherwise = []
  70. match' (Literal _) "" = []
  71. match' (Concat r1 r2) s = [s'' | s' <- memoMatch r1 s, s'' <- memoMatch r2 s']
  72. match' (Alternate r1 r2) s = memoMatch r1 s ++ memoMatch r2 s
  73. match' (Kleene r) s =
  74. ordNub $ s : [s' | x <- memoMatch r s, x /= s, s' <- memoMatch (Kleene r) x]
  75. match' (Optional r) s = s : memoMatch r s
  76. match' (Range c1 c2) (c : ss) = [ss | c1 <= c && c2 >= c]
  77. match' (Range _ _) [] = []
  78. isMatch :: RegEx -> String -> Bool
  79. isMatch r s =
  80. any null $ match r s
  81. alphaNums :: String
  82. -- alphaNums = ['a'..'z'] <> ['A'..'Z'] <> ['0'..'9'] <> "\"#$%&/()=?*+'~-.,;:_"
  83. alphaNums = map chr [32..127]
  84. randomRegex :: MonadRandom m => Int -> m RegEx
  85. randomRegex 0 = getRandomR (1 :: Int, 10) >>= \case
  86. 1 -> return AnyChar
  87. 2 -> do
  88. x <- uniform alphaNums
  89. y <- uniform alphaNums
  90. return $ Range (min x y) (max x y)
  91. -- 3 -> return Empty
  92. -- 4 -> return Failed
  93. _ -> Literal <$> uniform alphaNums
  94. randomRegex n = getRandomR (1 :: Int, 4) >>= \case
  95. 1 -> Concat <$> randomRegex left <*> randomRegex right
  96. 2 -> Alternate <$> randomRegex left <*> randomRegex right
  97. 3 -> Kleene <$> randomRegex (n - 1)
  98. 4 -> Optional <$> randomRegex (n - 1)
  99. _ -> error "Can't happen"
  100. where left = (n - 1) `div` 2
  101. right = (n - 1) - left
  102. weighRegex :: RegEx -> TreeSize
  103. weighRegex Failed = TreeSize 1 []
  104. weighRegex Empty = TreeSize 1 []
  105. weighRegex AnyChar = TreeSize 1 []
  106. weighRegex (Literal _) = TreeSize 1 []
  107. weighRegex (Range _ _) = TreeSize 1 []
  108. weighRegex (Concat l r) = let tl@(TreeSize sl _) = weighRegex l
  109. tr@(TreeSize sr _) = weighRegex r
  110. in TreeSize (sl + sr + 1) [tl, tr]
  111. weighRegex (Alternate l r) = let tl@(TreeSize sl _) = weighRegex l
  112. tr@(TreeSize sr _) = weighRegex r
  113. in TreeSize (sl + sr + 1) [tl, tr]
  114. weighRegex (Kleene l) = let tl@(TreeSize sl _) = weighRegex l
  115. in TreeSize (sl + 1) [tl]
  116. weighRegex (Optional l) = let tl@(TreeSize sl _) = weighRegex l
  117. in TreeSize (sl + 1) [tl]
  118. data TreeSize = TreeSize Int [TreeSize] deriving (Eq, Ord, Read, Show)
  119. randomSubtree :: MonadRandom m => RegEx -> m (RegEx, RegEx -> RegEx)
  120. randomSubtree regex = withWeight regex (weighRegex regex)
  121. where withWeight Failed _ = return (Failed, identity)
  122. withWeight Empty _ = return (Empty, identity)
  123. withWeight AnyChar _ = return (AnyChar, identity)
  124. withWeight (Literal c) _ = return (Literal c, identity)
  125. withWeight (Range c1 c2) _ = return (Range c1 c2, identity)
  126. withWeight (Concat l r) (TreeSize cs [lts@(TreeSize ls _), rts@(TreeSize _ _)]) = do
  127. choice <- getRandomR (1, cs)
  128. if choice == 1 then return (Concat l r, identity)
  129. else if choice - 1 < ls then do
  130. (lsub, lfun) <- withWeight l lts
  131. return (lsub, \x -> Concat (lfun x) r)
  132. else do
  133. (rsub, rfun) <- withWeight r rts
  134. return (rsub, Concat l . rfun)
  135. withWeight (Alternate l r) (TreeSize cs [lts@(TreeSize ls _), rts@(TreeSize _ _)]) = do
  136. choice <- getRandomR (1, cs)
  137. if choice == 1 then return (Alternate l r, identity)
  138. else if choice - 1 < ls then do
  139. (lsub, lfun) <- withWeight l lts
  140. return (lsub, \x -> Alternate (lfun x) r)
  141. else do
  142. (rsub, rfun) <- withWeight r rts
  143. return (rsub, Alternate l . rfun)
  144. withWeight (Kleene s) (TreeSize cs [sts@(TreeSize _ _)]) = do
  145. choice <- getRandomR (1, cs)
  146. if choice == 1 then return (Kleene s, identity)
  147. else do
  148. (sub, fun) <- withWeight s sts
  149. return (sub, Kleene . fun)
  150. withWeight (Optional s) (TreeSize cs [sts@(TreeSize _ _)]) = do
  151. choice <- getRandomR (1, cs)
  152. if choice == 1 then return (Optional s, identity)
  153. else do
  154. (sub, fun) <- withWeight s sts
  155. return (sub, Optional . fun)
  156. withWeight l r = error $ "Bad pattern: " <> show l <> " " <> show r
  157. isLeaf :: RegEx -> Bool
  158. isLeaf r | TreeSize 1 _ <- weighRegex r = True
  159. isLeaf _ = False
  160. -- | A random leaf subexpression
  161. randomLeaf :: MonadRandom m => RegEx -> m (RegEx, RegEx -> RegEx)
  162. randomLeaf regex = do
  163. (sub, hole) <- randomSubtree regex
  164. if isLeaf sub then return (sub, hole)
  165. else do
  166. (leaf, holeInSub) <- randomLeaf sub
  167. return (leaf, hole . holeInSub)
  168. -- | Only teak leaf regexes (ones that don't have subexpressions)
  169. tweakLeaf :: MonadRandom m => RegEx -> m RegEx
  170. tweakLeaf Failed = randomRegex 0
  171. tweakLeaf Empty = randomRegex 0
  172. tweakLeaf AnyChar = randomRegex 0
  173. tweakLeaf (Literal _) = randomRegex 0
  174. tweakLeaf (Range _ _) = randomRegex 0
  175. tweakLeaf _ = error "tweakLeaf only takes leaf regexes"
  176. tweakRegex :: MonadRandom m => RegEx -> m RegEx
  177. tweakRegex regex = do
  178. (leaf, hole) <- randomLeaf regex
  179. hole <$> tweakLeaf leaf
  180. mutateRegex :: MonadRandom m => RegEx -> m RegEx
  181. mutateRegex regex = do
  182. choice <- getRandomR (1 :: Int, 10)
  183. if choice == 1 then
  184. crossRegex regex regex
  185. else if choice == 2 then
  186. tweakRegex regex
  187. else do
  188. (sub, fun) <- randomSubtree regex
  189. let TreeSize s _ = weighRegex sub
  190. fun <$> (randomRegex =<< getRandomR (max (s - 2) 0, s + 2))
  191. newRandomRegex :: MonadRandom m => m RegEx
  192. newRandomRegex = getRandomR (0, 30) >>= randomRegex
  193. crossRegex :: MonadRandom m => RegEx -> RegEx -> m RegEx
  194. crossRegex a b = do
  195. (sub, _) <- randomSubtree a
  196. (_, fun) <- randomSubtree b
  197. return (fun sub)
  198. validEmails :: [String]
  199. validEmails = ["email@example.com", "firstname.lastname@example.com", "email@subdomain.example.com", "firstname+lastname@example.com", "email@123.123.123.123", "email@[123.123.123.123]", "“email”@example.com", "1234567890@example.com", "email@example-one.com", "_______@example.com", "email@example.name", "email@example.museum", "email@example.co.jp", "firstname-lastname@example.com"]
  200. invalidEmails :: [String]
  201. invalidEmails = ["plainaddress", "#@%^%#$@#$@#.com", "@example.com", "Joe Smith <email@example.com>", "email.example.com", "email@example@example.com", ".email@example.com", "email.@example.com", "email..email@example.com", "あいうえお@example.com", "email@example.com (Joe Smith)", "email@example", "email@-example.com", "email@example.web", "email@111.222.333.44444", "email@example..com", "Abc..123@example.com"]
  202. rangeSum :: RegEx -> Int
  203. rangeSum Failed = 0
  204. rangeSum Empty = 0
  205. rangeSum AnyChar = 0
  206. rangeSum (Literal _) = 0
  207. rangeSum (Range a b) = ord b - ord a
  208. rangeSum (Concat l r) = rangeSum l + rangeSum r
  209. rangeSum (Alternate l r) = rangeSum l + rangeSum r
  210. rangeSum (Kleene l) = rangeSum l
  211. rangeSum (Optional l) = rangeSum l
  212. evaluateRegex :: (MonadRandom m, MonadIO m) => [String] -> [String] -> RegEx -> m (Int, Int, Int)
  213. evaluateRegex matchThese don'tMatchThese rgx = do
  214. let rgx' = "^" <> showRegex rgx <> "$"
  215. cmp = case compileM (toS rgx') [] of
  216. Right cmp' -> cmp'
  217. Left err -> error $ rgx' <> " ==> " <> show rgx <> " ==> " <> err
  218. cmp `seq` return ()
  219. maybeScore <- liftIO $ timeout 1000 $ do
  220. let posMatch = length $ filter (=~ cmp) matchThese
  221. negMatch = length $ filter (=~ cmp) don'tMatchThese
  222. validity <- evaluate $ posMatch - negMatch
  223. sizePenalty <- evaluate $ let TreeSize s _ = weighRegex rgx in -(max s 10)
  224. rangePenalty <- evaluate $ rangeSum rgx
  225. void $ evaluate sizePenalty
  226. -- let score = validity `seq` sizePenalty `seq`
  227. return (validity + min 0 (sizePenalty + 100), sizePenalty, -rangePenalty)
  228. case maybeScore of
  229. Nothing ->
  230. -- putStrLn $ "Timed out on " <> showRegex rgx
  231. return (-100, 0, 0)
  232. Just s -> return s
  233. printActivity :: MonadIO m => EvolutionActivity RegEx (Int, Int, Int) -> m ()
  234. printActivity (NewGenerationActivity gen) = do
  235. let (reg, score) = selectBest gen
  236. putStr (pshow score <> ": " <> toS (showRegex reg) <> "\n" :: Text)
  237. printActivity EnvironmentMutation = putText "Env mutation"
  238. printActivity _ = return ()
  239. loadWords :: FilePath -> IO [String]
  240. loadWords path = map toS . Text.lines <$> readFile path
  241. mutateWord :: MonadRandom m => Double -> String -> m String
  242. mutateWord strength word = do
  243. keeps <- replicateM (length word) ((>= strength) <$> getRandomR (0, 1))
  244. return $ map snd $ filter fst $ zip keeps word
  245. evoHandler :: _ => [String] -> [String] -> Evolution RegEx (Int, Int, Int) m
  246. evoHandler englishWords germanWords = EvolutionMethods newRandomRegex mutateRegex crossRegex eval mutateEnv
  247. where
  248. mutateEnv = do
  249. strength <- getRandomR (0, 0.5)
  250. eng <- mapM (mutateWord strength) englishWords
  251. ger <- mapM (mutateWord strength) germanWords
  252. setState (eng, ger)
  253. eval u = do
  254. (eng, ger) <- getState
  255. evaluateRegex eng ger u
  256. testWith ::
  257. RuntimeImplemented (Evolution RegEx (Int, Int, Int))
  258. (RuntimeImplemented (Log (EvolutionActivity RegEx (Int, Int, Int)))
  259. (RuntimeImplemented (ReadEnv GeneticConfiguration)
  260. (StateT ([String], [String])
  261. IO))) b
  262. -> IO b
  263. testWith evo = do
  264. english <- readFile "english.txt"
  265. let englishWords = map toS $ Text.lines english
  266. german <- readFile "german.txt"
  267. let germanWords = map toS $ Text.lines german
  268. evo
  269. & handleEvolution
  270. (evoHandler englishWords germanWords)
  271. (LogMethods printActivity)
  272. & implementReadEnv (return (GeneticConfiguration 10 20 30))
  273. & implementStateViaStateT (englishWords, germanWords)
  274. test :: IO ()
  275. test = testWith speciesEvolution
  276. testIterative :: IO ()
  277. testIterative = testWith iterativeEvolution
  278. testDistributed :: IO ()
  279. testDistributed = testWith distributedSpeciesEvolutionClient
  280. testDistributedServer :: IO ()
  281. testDistributedServer = distributedSpeciesEvolution (Proxy :: Proxy (RegEx, (Int, Int, Int)))
  282. oneOf :: String -> RegEx
  283. oneOf [c] = Literal c
  284. oneOf (c : cs) = Alternate (Literal c) (oneOf cs)
  285. oneOrMore :: RegEx -> RegEx
  286. oneOrMore r = Concat r (Kleene r)
  287. az09 :: RegEx
  288. az09 = Range 'a' 'z' `Alternate` Range '0' '9'
  289. -- ^[a-z0-9!#$%&'*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+\/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?$
  290. charset :: RegEx
  291. charset = az09
  292. & Alternate (oneOf "!#$%&'*+\\/=?^_`{|}~-")
  293. emailRgx :: RegEx
  294. emailRgx =
  295. oneOrMore charset `Concat` Kleene (Literal '.' `Concat` oneOrMore charset)
  296. `Concat` (Literal '@')
  297. `Concat` oneOrMore (az09 `Concat` Optional (Kleene (az09 `Alternate` Literal '-') `Concat` az09) `Concat` Literal '.')
  298. `Concat` az09
  299. `Concat` Optional (Kleene (az09 `Alternate` Literal '-') `Concat` az09)
  300. bestOrganic :: RegEx
  301. bestOrganic = Concat (Concat (Optional (Concat (Literal '"') (Kleene AnyChar))) (Concat (Optional (Concat (Optional (Range 'P' 'p')) (Concat (Range 'P' 'p') (Concat AnyChar (Concat AnyChar (Optional (Concat (Concat AnyChar (Concat AnyChar (Range 'S' 'd'))) (Concat (Range '.' 'p') (Concat AnyChar (Concat AnyChar (Concat AnyChar (Concat (Kleene (Concat AnyChar AnyChar)) (Concat AnyChar (Concat AnyChar (Optional (Range 'P' 'p')))))))))))))))) (Concat (Alternate (Literal '1') (Range 'S' 'd')) (Kleene AnyChar)))) (Concat (Alternate (Literal '1') (Range 'P' 'p')) (Optional (Concat AnyChar (Range '.' 'p'))))