PageRenderTime 27ms CodeModel.GetById 19ms RepoModel.GetById 1ms app.codeStats 0ms

/third_party/rust/regex/v1/crate/src/re_set.rs

https://github.com/chromium/chromium
Rust | 475 lines | 176 code | 37 blank | 262 comment | 10 complexity | bff14d00cdf8447b61984281806437cc MD5 | raw file
Possible License(s): MPL-2.0-no-copyleft-exception, Apache-2.0, BSD-3-Clause
  1. macro_rules! define_set {
  2. ($name:ident, $builder_mod:ident, $text_ty:ty, $as_bytes:expr,
  3. $(#[$doc_regexset_example:meta])* ) => {
  4. pub mod $name {
  5. use std::fmt;
  6. use std::iter;
  7. use std::slice;
  8. use std::vec;
  9. use crate::error::Error;
  10. use crate::exec::Exec;
  11. use crate::re_builder::$builder_mod::RegexSetBuilder;
  12. use crate::re_trait::RegularExpression;
  /// Match multiple (possibly overlapping) regular expressions in a single scan.
  ///
  /// A regex set corresponds to the union of two or more regular expressions.
  /// That is, a regex set will match text where at least one of its
  /// constituent regular expressions matches. A regex set as it is formulated here
  /// provides a touch more power: it will also report *which* regular
  /// expressions in the set match. Indeed, this is the key difference between
  /// regex sets and a single `Regex` with many alternates, since only one
  /// alternate can match at a time.
  ///
  /// For example, consider regular expressions to match email addresses and
  /// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a
  /// regex set is constructed from those regexes, then searching the text
  /// `foo@example.com` will report both regexes as matching. Of course, one
  /// could accomplish this by compiling each regex on its own and doing two
  /// searches over the text. The key advantage of using a regex set is that it
  /// will report the matching regexes using a *single pass through the text*.
  /// If one has hundreds or thousands of regexes to match repeatedly (like a URL
  /// router for a complex web application or a user agent matcher), then a regex
  /// set can realize huge performance gains.
  ///
  /// # Example
  ///
  /// This shows how the above two regexes (for matching email addresses and
  /// domains) might work:
  ///
  $(#[$doc_regexset_example])*
  ///
  /// Note that it would be possible to adapt the above example to using `Regex`
  /// with an expression like:
  ///
  /// ```text
  /// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net))
  /// ```
  ///
  /// After a match, one could then inspect the capture groups to figure out
  /// which alternates matched. The problem is that it is hard to make this
  /// approach scale when there are many regexes since the overlap between each
  /// alternate isn't always obvious to reason about.
  ///
  /// # Limitations
  ///
  /// Regex sets are limited to answering the following two questions:
  ///
  /// 1. Does any regex in the set match?
  /// 2. If so, which regexes in the set match?
  ///
  /// As with the main `Regex` type, it is cheaper to ask (1) instead of (2)
  /// since the matching engines can stop after the first match is found.
  ///
  /// Other features like finding the location of successive matches or their
  /// sub-captures aren't supported. If you need this functionality, the
  /// recommended approach is to compile each regex in the set independently and
  /// selectively match them based on which regexes in the set matched.
  ///
  /// # Performance
  ///
  /// A `RegexSet` has the same performance characteristics as `Regex`. Namely,
  /// search takes `O(mn)` time, where `m` is proportional to the size of the
  /// regex set and `n` is proportional to the length of the search text.
  #[derive(Clone)]
  // Newtype over `Exec`: every method of the set forwards to the wrapped value.
  pub struct RegexSet(Exec);
  75. impl RegexSet {
  76. /// Create a new regex set with the given regular expressions.
  77. ///
  78. /// This takes an iterator of `S`, where `S` is something that can produce
  79. /// a `&str`. If any of the strings in the iterator are not valid regular
  80. /// expressions, then an error is returned.
  81. ///
  82. /// # Example
  83. ///
  84. /// Create a new regex set from an iterator of strings:
  85. ///
  86. /// ```rust
  87. /// # use regex::RegexSet;
  88. /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
  89. /// assert!(set.is_match("foo"));
  90. /// ```
  91. pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error>
  92. where S: AsRef<str>, I: IntoIterator<Item=S> {
  93. RegexSetBuilder::new(exprs).build()
  94. }
  95. /// Create a new empty regex set.
  96. ///
  97. /// # Example
  98. ///
  99. /// ```rust
  100. /// # use regex::RegexSet;
  101. /// let set = RegexSet::empty();
  102. /// assert!(set.is_empty());
  103. /// ```
  104. pub fn empty() -> RegexSet {
  105. RegexSetBuilder::new(&[""; 0]).build().unwrap()
  106. }
  107. /// Returns true if and only if one of the regexes in this set matches
  108. /// the text given.
  109. ///
  110. /// This method should be preferred if you only need to test whether any
  111. /// of the regexes in the set should match, but don't care about *which*
  112. /// regexes matched. This is because the underlying matching engine will
  113. /// quit immediately after seeing the first match instead of continuing to
  114. /// find all matches.
  115. ///
  116. /// Note that as with searches using `Regex`, the expression is unanchored
  117. /// by default. That is, if the regex does not start with `^` or `\A`, or
  118. /// end with `$` or `\z`, then it is permitted to match anywhere in the
  119. /// text.
  120. ///
  121. /// # Example
  122. ///
  123. /// Tests whether a set matches some text:
  124. ///
  125. /// ```rust
  126. /// # use regex::RegexSet;
  127. /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
  128. /// assert!(set.is_match("foo"));
  129. /// assert!(!set.is_match("☃"));
  130. /// ```
  131. pub fn is_match(&self, text: $text_ty) -> bool {
  132. self.is_match_at(text, 0)
  133. }
  134. /// Returns the same as is_match, but starts the search at the given
  135. /// offset.
  136. ///
  137. /// The significance of the starting point is that it takes the surrounding
  138. /// context into consideration. For example, the `\A` anchor can only
  139. /// match when `start == 0`.
  140. #[doc(hidden)]
  141. pub fn is_match_at(&self, text: $text_ty, start: usize) -> bool {
  142. self.0.searcher().is_match_at($as_bytes(text), start)
  143. }
  144. /// Returns the set of regular expressions that match in the given text.
  145. ///
  146. /// The set returned contains the index of each regular expression that
  147. /// matches in the given text. The index is in correspondence with the
  148. /// order of regular expressions given to `RegexSet`'s constructor.
  149. ///
  150. /// The set can also be used to iterate over the matched indices.
  151. ///
  152. /// Note that as with searches using `Regex`, the expression is unanchored
  153. /// by default. That is, if the regex does not start with `^` or `\A`, or
  154. /// end with `$` or `\z`, then it is permitted to match anywhere in the
  155. /// text.
  156. ///
  157. /// # Example
  158. ///
  159. /// Tests which regular expressions match the given text:
  160. ///
  161. /// ```rust
  162. /// # use regex::RegexSet;
  163. /// let set = RegexSet::new(&[
  164. /// r"\w+",
  165. /// r"\d+",
  166. /// r"\pL+",
  167. /// r"foo",
  168. /// r"bar",
  169. /// r"barfoo",
  170. /// r"foobar",
  171. /// ]).unwrap();
  172. /// let matches: Vec<_> = set.matches("foobar").into_iter().collect();
  173. /// assert_eq!(matches, vec![0, 2, 3, 4, 6]);
  174. ///
  175. /// // You can also test whether a particular regex matched:
  176. /// let matches = set.matches("foobar");
  177. /// assert!(!matches.matched(5));
  178. /// assert!(matches.matched(6));
  179. /// ```
  180. pub fn matches(&self, text: $text_ty) -> SetMatches {
  181. let mut matches = vec![false; self.0.regex_strings().len()];
  182. let any = self.read_matches_at(&mut matches, text, 0);
  183. SetMatches {
  184. matched_any: any,
  185. matches: matches,
  186. }
  187. }
  188. /// Returns the same as matches, but starts the search at the given
  189. /// offset and stores the matches into the slice given.
  190. ///
  191. /// The significance of the starting point is that it takes the surrounding
  192. /// context into consideration. For example, the `\A` anchor can only
  193. /// match when `start == 0`.
  194. ///
  195. /// `matches` must have a length that is at least the number of regexes
  196. /// in this set.
  197. ///
  198. /// This method returns true if and only if at least one member of
  199. /// `matches` is true after executing the set against `text`.
  200. #[doc(hidden)]
  201. pub fn read_matches_at(
  202. &self,
  203. matches: &mut [bool],
  204. text: $text_ty,
  205. start: usize,
  206. ) -> bool {
  207. self.0.searcher().many_matches_at(matches, $as_bytes(text), start)
  208. }
  209. /// Returns the total number of regular expressions in this set.
  210. pub fn len(&self) -> usize {
  211. self.0.regex_strings().len()
  212. }
  213. /// Returns `true` if this set contains no regular expressions.
  214. pub fn is_empty(&self) -> bool {
  215. self.0.regex_strings().is_empty()
  216. }
  217. /// Returns the patterns that this set will match on.
  218. ///
  219. /// This function can be used to determine the pattern for a match. The
  220. /// slice returned has exactly as many patterns givens to this regex set,
  221. /// and the order of the slice is the same as the order of the patterns
  222. /// provided to the set.
  223. ///
  224. /// # Example
  225. ///
  226. /// ```rust
  227. /// # use regex::RegexSet;
  228. /// let set = RegexSet::new(&[
  229. /// r"\w+",
  230. /// r"\d+",
  231. /// r"\pL+",
  232. /// r"foo",
  233. /// r"bar",
  234. /// r"barfoo",
  235. /// r"foobar",
  236. /// ]).unwrap();
  237. /// let matches: Vec<_> = set
  238. /// .matches("foobar")
  239. /// .into_iter()
  240. /// .map(|match_idx| &set.patterns()[match_idx])
  241. /// .collect();
  242. /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]);
  243. /// ```
  244. pub fn patterns(&self) -> &[String] {
  245. self.0.regex_strings()
  246. }
  247. }
  248. /// A set of matches returned by a regex set.
  249. #[derive(Clone, Debug)]
  250. pub struct SetMatches {
  251. matched_any: bool,
  252. matches: Vec<bool>,
  253. }
  254. impl SetMatches {
  255. /// Whether this set contains any matches.
  256. pub fn matched_any(&self) -> bool {
  257. self.matched_any
  258. }
  259. /// Whether the regex at the given index matched.
  260. ///
  261. /// The index for a regex is determined by its insertion order upon the
  262. /// initial construction of a `RegexSet`, starting at `0`.
  263. ///
  264. /// # Panics
  265. ///
  266. /// If `regex_index` is greater than or equal to `self.len()`.
  267. pub fn matched(&self, regex_index: usize) -> bool {
  268. self.matches[regex_index]
  269. }
  270. /// The total number of regexes in the set that created these matches.
  271. pub fn len(&self) -> usize {
  272. self.matches.len()
  273. }
  274. /// Returns an iterator over indexes in the regex that matched.
  275. ///
  276. /// This will always produces matches in ascending order of index, where
  277. /// the index corresponds to the index of the regex that matched with
  278. /// respect to its position when initially building the set.
  279. pub fn iter(&self) -> SetMatchesIter<'_> {
  280. SetMatchesIter((&*self.matches).into_iter().enumerate())
  281. }
  282. }
  283. impl IntoIterator for SetMatches {
  284. type IntoIter = SetMatchesIntoIter;
  285. type Item = usize;
  286. fn into_iter(self) -> Self::IntoIter {
  287. SetMatchesIntoIter(self.matches.into_iter().enumerate())
  288. }
  289. }
  290. impl<'a> IntoIterator for &'a SetMatches {
  291. type IntoIter = SetMatchesIter<'a>;
  292. type Item = usize;
  293. fn into_iter(self) -> Self::IntoIter {
  294. self.iter()
  295. }
  296. }
  /// An owned iterator over the set of matches from a regex set.
  ///
  /// This will always produce matches in ascending order of index, where the
  /// index corresponds to the index of the regex that matched with respect to
  /// its position when initially building the set.
  #[derive(Debug)]
  pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>);

  impl Iterator for SetMatchesIntoIter {
      type Item = usize;

      /// Skips over `false` flags and returns the index of the next `true`
      /// one, or `None` once every remaining flag has been consumed.
      fn next(&mut self) -> Option<usize> {
          self.0.find(|&(_, matched)| matched).map(|(index, _)| index)
      }

      /// Returns `(0, upper)`, where `upper` is the inner iterator's upper
      /// bound on the number of flags not yet examined.
      ///
      /// The lower bound must be zero: every remaining flag may be `false`
      /// and therefore skipped, so forwarding the inner iterator's lower
      /// bound would promise more items than this iterator may yield.
      fn size_hint(&self) -> (usize, Option<usize>) {
          let (_, upper) = self.0.size_hint();
          (0, upper)
      }
  }

  impl DoubleEndedIterator for SetMatchesIntoIter {
      /// Same as `next`, but scans from the highest remaining index downwards.
      fn next_back(&mut self) -> Option<usize> {
          self.0.rfind(|&(_, matched)| matched).map(|(index, _)| index)
      }
  }

  impl iter::FusedIterator for SetMatchesIntoIter {}
  /// A borrowed iterator over the set of matches from a regex set.
  ///
  /// The lifetime `'a` refers to the lifetime of a `SetMatches` value.
  ///
  /// This will always produce matches in ascending order of index, where the
  /// index corresponds to the index of the regex that matched with respect to
  /// its position when initially building the set.
  #[derive(Clone, Debug)]
  pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>);

  impl<'a> Iterator for SetMatchesIter<'a> {
      type Item = usize;

      /// Skips over `false` flags and returns the index of the next `true`
      /// one, or `None` once every remaining flag has been consumed.
      fn next(&mut self) -> Option<usize> {
          self.0.find(|&(_, &matched)| matched).map(|(index, _)| index)
      }

      /// Returns `(0, upper)`, where `upper` is the inner iterator's upper
      /// bound on the number of flags not yet examined.
      ///
      /// The lower bound must be zero: every remaining flag may be `false`
      /// and therefore skipped, so forwarding the inner iterator's lower
      /// bound would promise more items than this iterator may yield.
      fn size_hint(&self) -> (usize, Option<usize>) {
          let (_, upper) = self.0.size_hint();
          (0, upper)
      }
  }

  impl<'a> DoubleEndedIterator for SetMatchesIter<'a> {
      /// Same as `next`, but scans from the highest remaining index downwards.
      fn next_back(&mut self) -> Option<usize> {
          self.0.rfind(|&(_, &matched)| matched).map(|(index, _)| index)
      }
  }

  impl<'a> iter::FusedIterator for SetMatchesIter<'a> {}
  367. #[doc(hidden)]
  368. impl From<Exec> for RegexSet {
  369. fn from(exec: Exec) -> Self {
  370. RegexSet(exec)
  371. }
  372. }
  373. impl fmt::Debug for RegexSet {
  374. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  375. write!(f, "RegexSet({:?})", self.0.regex_strings())
  376. }
  377. }
  /// Views a string slice as its underlying bytes.
  // Each `define_set!` invocation names only one of the `as_bytes_*` helpers,
  // so the other one goes unused inside that generated module.
  #[allow(dead_code)]
  fn as_bytes_str(text: &str) -> &[u8] {
      str::as_bytes(text)
  }
  /// Returns the given byte slice unchanged.
  // Each `define_set!` invocation names only one of the `as_bytes_*` helpers,
  // so the other one goes unused inside that generated module.
  #[allow(dead_code)]
  fn as_bytes_bytes(text: &[u8]) -> &[u8] {
      &text[..]
  }
  380. }
  381. }
  382. }
  // Generates the `unicode` module: a `RegexSet` whose search methods take
  // `&str` text. The doc comment passed as the last argument is spliced into
  // the generated `RegexSet` type documentation as its example.
  define_set! {
  // Name of the generated module.
  unicode,
  // Submodule of `crate::re_builder` that supplies `RegexSetBuilder`.
  set_unicode,
  // Text type accepted by the search methods.
  &str,
  // Helper used to turn the text into `&[u8]` before searching.
  as_bytes_str,
  /// ```rust
  /// # use regex::RegexSet;
  /// let set = RegexSet::new(&[
  /// r"[a-z]+@[a-z]+\.(com|org|net)",
  /// r"[a-z]+\.(com|org|net)",
  /// ]).unwrap();
  ///
  /// // Ask whether any regexes in the set match.
  /// assert!(set.is_match("foo@example.com"));
  ///
  /// // Identify which regexes in the set match.
  /// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect();
  /// assert_eq!(vec![0, 1], matches);
  ///
  /// // Try again, but with text that only matches one of the regexes.
  /// let matches: Vec<_> = set.matches("example.com").into_iter().collect();
  /// assert_eq!(vec![1], matches);
  ///
  /// // Try again, but with text that doesn't match any regex in the set.
  /// let matches: Vec<_> = set.matches("example").into_iter().collect();
  /// assert!(matches.is_empty());
  /// ```
  }
  // Generates the `bytes` module: a `RegexSet` whose search methods take
  // `&[u8]` text. The doc comment passed as the last argument is spliced into
  // the generated `RegexSet` type documentation as its example.
  define_set! {
  // Name of the generated module.
  bytes,
  // Submodule of `crate::re_builder` that supplies `RegexSetBuilder`.
  set_bytes,
  // Text type accepted by the search methods.
  &[u8],
  // Helper used to pass the text through as `&[u8]` before searching.
  as_bytes_bytes,
  /// ```rust
  /// # use regex::bytes::RegexSet;
  /// let set = RegexSet::new(&[
  /// r"[a-z]+@[a-z]+\.(com|org|net)",
  /// r"[a-z]+\.(com|org|net)",
  /// ]).unwrap();
  ///
  /// // Ask whether any regexes in the set match.
  /// assert!(set.is_match(b"foo@example.com"));
  ///
  /// // Identify which regexes in the set match.
  /// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect();
  /// assert_eq!(vec![0, 1], matches);
  ///
  /// // Try again, but with text that only matches one of the regexes.
  /// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect();
  /// assert_eq!(vec![1], matches);
  ///
  /// // Try again, but with text that doesn't match any regex in the set.
  /// let matches: Vec<_> = set.matches(b"example").into_iter().collect();
  /// assert!(matches.is_empty());
  /// ```
  }