PageRenderTime 25ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/third_party/rust/regex/src/re_set.rs

https://bitbucket.org/vionika/spin.android
Rust | 431 lines | 164 code | 33 blank | 234 comment | 8 complexity | a8ea737b0bd095a38fda6185acc9eafc MD5 | raw file
Possible License(s): JSON, 0BSD, AGPL-1.0, BSD-2-Clause, GPL-3.0, LGPL-2.1, LGPL-3.0, CC0-1.0, AGPL-3.0, MPL-2.0, Apache-2.0, MIT, BSD-3-Clause, MPL-2.0-no-copyleft-exception, GPL-2.0, Unlicense
  1. // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
  2. // file at the top-level directory of this distribution and at
  3. // http://rust-lang.org/COPYRIGHT.
  4. //
  5. // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
  6. // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
  7. // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
  8. // option. This file may not be copied, modified, or distributed
  9. // except according to those terms.
  10. macro_rules! define_set {
  11. ($name:ident, $builder_mod:ident, $text_ty:ty, $as_bytes:expr,
  12. $(#[$doc_regexset_example:meta])* ) => {
  13. pub mod $name {
  14. use std::fmt;
  15. use std::iter;
  16. use std::slice;
  17. use std::vec;
  18. use error::Error;
  19. use exec::Exec;
  20. use re_builder::$builder_mod::RegexSetBuilder;
  21. use re_trait::RegularExpression;
  22. /// Match multiple (possibly overlapping) regular expressions in a single scan.
  23. ///
  24. /// A regex set corresponds to the union of two or more regular expressions.
  25. /// That is, a regex set will match text where at least one of its
  26. /// constituent regular expressions matches. A regex set as it is formulated here
  27. /// provides a touch more power: it will also report *which* regular
  28. /// expressions in the set match. Indeed, this is the key difference between
  29. /// regex sets and a single `Regex` with many alternates, since only one
  30. /// alternate can match at a time.
  31. ///
  32. /// For example, consider regular expressions to match email addresses and
  33. /// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a
  34. /// regex set is constructed from those regexes, then searching the text
  35. /// `foo@example.com` will report both regexes as matching. Of course, one
  36. /// could accomplish this by compiling each regex on its own and doing two
  37. /// searches over the text. The key advantage of using a regex set is that it
  38. /// will report the matching regexes using a *single pass through the text*.
  39. /// If one has hundreds or thousands of regexes to match repeatedly (like a URL
  40. /// router for a complex web application or a user agent matcher), then a regex
  41. /// set can realize huge performance gains.
  42. ///
  43. /// # Example
  44. ///
  45. /// This shows how the above two regexes (for matching email addresses and
  46. /// domains) might work:
  47. ///
  // The repetition below splices in the doc-example attributes that each
  // `define_set!` invocation passes as its trailing arguments.
  48. $(#[$doc_regexset_example])*
  49. ///
  50. /// Note that it would be possible to adapt the above example to using `Regex`
  51. /// with an expression like:
  52. ///
  53. /// ```ignore
  54. /// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net))
  55. /// ```
  56. ///
  57. /// After a match, one could then inspect the capture groups to figure out
  58. /// which alternates matched. The problem is that it is hard to make this
  59. /// approach scale when there are many regexes since the overlap between each
  60. /// alternate isn't always obvious to reason about.
  61. ///
  62. /// # Limitations
  63. ///
  64. /// Regex sets are limited to answering the following two questions:
  65. ///
  66. /// 1. Does any regex in the set match?
  67. /// 2. If so, which regexes in the set match?
  68. ///
  69. /// As with the main `Regex` type, it is cheaper to ask (1) instead of (2)
  70. /// since the matching engines can stop after the first match is found.
  71. ///
  72. /// Other features like finding the location of successive matches or their
  73. /// sub-captures aren't supported. If you need this functionality, the
  74. /// recommended approach is to compile each regex in the set independently and
  75. /// selectively match them based on which regexes in the set matched.
  76. ///
  77. /// # Performance
  78. ///
  79. /// A `RegexSet` has the same performance characteristics as `Regex`. Namely,
  80. /// search takes `O(mn)` time, where `m` is proportional to the size of the
  81. /// regex set and `n` is proportional to the length of the search text.
  82. #[derive(Clone)]
  83. pub struct RegexSet(Exec);
  84. impl RegexSet {
  85. /// Create a new regex set with the given regular expressions.
  86. ///
  87. /// This takes an iterator of `S`, where `S` is something that can produce
  88. /// a `&str`. If any of the strings in the iterator are not valid regular
  89. /// expressions, then an error is returned.
  90. ///
  91. /// # Example
  92. ///
  93. /// Create a new regex set from an iterator of strings:
  94. ///
  95. /// ```rust
  96. /// # use regex::RegexSet;
  97. /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
  98. /// assert!(set.is_match("foo"));
  99. /// ```
  100. pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error>
  101. where S: AsRef<str>, I: IntoIterator<Item=S> {
  102. RegexSetBuilder::new(exprs).build()
  103. }
  104. /// Returns true if and only if one of the regexes in this set matches
  105. /// the text given.
  106. ///
  107. /// This method should be preferred if you only need to test whether any
  108. /// of the regexes in the set should match, but don't care about *which*
  109. /// regexes matched. This is because the underlying matching engine will
  110. /// quit immediately after seeing the first match instead of continuing to
  111. /// find all matches.
  112. ///
  113. /// Note that as with searches using `Regex`, the expression is unanchored
  114. /// by default. That is, if the regex does not start with `^` or `\A`, or
  115. /// end with `$` or `\z`, then it is permitted to match anywhere in the
  116. /// text.
  117. ///
  118. /// # Example
  119. ///
  120. /// Tests whether a set matches some text:
  121. ///
  122. /// ```rust
  123. /// # use regex::RegexSet;
  124. /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
  125. /// assert!(set.is_match("foo"));
  126. /// assert!(!set.is_match("☃"));
  127. /// ```
  128. pub fn is_match(&self, text: $text_ty) -> bool {
  129. self.is_match_at(text, 0)
  130. }
  131. /// Returns the same as is_match, but starts the search at the given
  132. /// offset.
  133. ///
  134. /// The significance of the starting point is that it takes the surrounding
  135. /// context into consideration. For example, the `\A` anchor can only
  136. /// match when `start == 0`.
  137. #[doc(hidden)]
  138. pub fn is_match_at(&self, text: $text_ty, start: usize) -> bool {
  139. self.0.searcher().is_match_at($as_bytes(text), start)
  140. }
  141. /// Returns the set of regular expressions that match in the given text.
  142. ///
  143. /// The set returned contains the index of each regular expression that
  144. /// matches in the given text. The index is in correspondence with the
  145. /// order of regular expressions given to `RegexSet`'s constructor.
  146. ///
  147. /// The set can also be used to iterate over the matched indices.
  148. ///
  149. /// Note that as with searches using `Regex`, the expression is unanchored
  150. /// by default. That is, if the regex does not start with `^` or `\A`, or
  151. /// end with `$` or `\z`, then it is permitted to match anywhere in the
  152. /// text.
  153. ///
  154. /// # Example
  155. ///
  156. /// Tests which regular expressions match the given text:
  157. ///
  158. /// ```rust
  159. /// # use regex::RegexSet;
  160. /// let set = RegexSet::new(&[
  161. /// r"\w+",
  162. /// r"\d+",
  163. /// r"\pL+",
  164. /// r"foo",
  165. /// r"bar",
  166. /// r"barfoo",
  167. /// r"foobar",
  168. /// ]).unwrap();
  169. /// let matches: Vec<_> = set.matches("foobar").into_iter().collect();
  170. /// assert_eq!(matches, vec![0, 2, 3, 4, 6]);
  171. ///
  172. /// // You can also test whether a particular regex matched:
  173. /// let matches = set.matches("foobar");
  174. /// assert!(!matches.matched(5));
  175. /// assert!(matches.matched(6));
  176. /// ```
  177. pub fn matches(&self, text: $text_ty) -> SetMatches {
  178. let mut matches = vec![false; self.0.regex_strings().len()];
  179. let any = self.read_matches_at(&mut matches, text, 0);
  180. SetMatches {
  181. matched_any: any,
  182. matches: matches,
  183. }
  184. }
  185. /// Returns the same as matches, but starts the search at the given
  186. /// offset and stores the matches into the slice given.
  187. ///
  188. /// The significance of the starting point is that it takes the surrounding
  189. /// context into consideration. For example, the `\A` anchor can only
  190. /// match when `start == 0`.
  191. ///
  192. /// `matches` must have a length that is at least the number of regexes
  193. /// in this set.
  194. ///
  195. /// This method returns true if and only if at least one member of
  196. /// `matches` is true after executing the set against `text`.
  197. #[doc(hidden)]
  198. pub fn read_matches_at(
  199. &self,
  200. matches: &mut [bool],
  201. text: $text_ty,
  202. start: usize,
  203. ) -> bool {
  204. self.0.searcher().many_matches_at(matches, $as_bytes(text), start)
  205. }
  206. /// Returns the total number of regular expressions in this set.
  207. pub fn len(&self) -> usize {
  208. self.0.regex_strings().len()
  209. }
  210. }
  211. /// A set of matches returned by a regex set.
  212. #[derive(Clone, Debug)]
  213. pub struct SetMatches {
  214. matched_any: bool,
  215. matches: Vec<bool>,
  216. }
  217. impl SetMatches {
  218. /// Whether this set contains any matches.
  219. pub fn matched_any(&self) -> bool {
  220. self.matched_any
  221. }
  222. /// Whether the regex at the given index matched.
  223. ///
  224. /// The index for a regex is determined by its insertion order upon the
  225. /// initial construction of a `RegexSet`, starting at `0`.
  226. ///
  227. /// # Panics
  228. ///
  229. /// If `regex_index` is greater than or equal to `self.len()`.
  230. pub fn matched(&self, regex_index: usize) -> bool {
  231. self.matches[regex_index]
  232. }
  233. /// The total number of regexes in the set that created these matches.
  234. pub fn len(&self) -> usize {
  235. self.matches.len()
  236. }
  237. /// Returns an iterator over indexes in the regex that matched.
  238. ///
  239. /// This will always produces matches in ascending order of index, where
  240. /// the index corresponds to the index of the regex that matched with
  241. /// respect to its position when initially building the set.
  242. pub fn iter(&self) -> SetMatchesIter {
  243. SetMatchesIter((&*self.matches).into_iter().enumerate())
  244. }
  245. }
  246. impl IntoIterator for SetMatches {
  247. type IntoIter = SetMatchesIntoIter;
  248. type Item = usize;
  249. fn into_iter(self) -> Self::IntoIter {
  250. SetMatchesIntoIter(self.matches.into_iter().enumerate())
  251. }
  252. }
  253. impl<'a> IntoIterator for &'a SetMatches {
  254. type IntoIter = SetMatchesIter<'a>;
  255. type Item = usize;
  256. fn into_iter(self) -> Self::IntoIter {
  257. self.iter()
  258. }
  259. }
  /// An owned iterator over the set of matches from a regex set.
  ///
  /// This will always produce matches in ascending order of index, where the
  /// index corresponds to the index of the regex that matched with respect to
  /// its position when initially building the set.
  pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>);

  impl Iterator for SetMatchesIntoIter {
      type Item = usize;

      /// Advances to the next regex that matched and returns its index, or
      /// `None` once every remaining flag has been visited.
      fn next(&mut self) -> Option<usize> {
          // Entries whose flag is `false` are skipped; only indexes of
          // matched regexes are yielded.
          self.0.find(|&(_, matched)| matched).map(|(index, _)| index)
      }

      /// Returns `(0, upper)`, where `upper` is the inner iterator's upper
      /// bound (the number of flags not yet visited).
      ///
      /// The lower bound is zero because any number of the remaining flags
      /// may be `false` and therefore skipped; forwarding the inner lower
      /// bound would promise more items than this iterator may yield.
      fn size_hint(&self) -> (usize, Option<usize>) {
          let (_, upper) = self.0.size_hint();
          (0, upper)
      }
  }

  impl DoubleEndedIterator for SetMatchesIntoIter {
      /// Walks from the back to the closest regex that matched and returns
      /// its index, or `None` once every remaining flag has been visited.
      fn next_back(&mut self) -> Option<usize> {
          self.0
              .by_ref()
              .rev()
              .find(|&(_, matched)| matched)
              .map(|(index, _)| index)
      }
  }
  /// A borrowed iterator over the set of matches from a regex set.
  ///
  /// The lifetime `'a` refers to the lifetime of a `SetMatches` value.
  ///
  /// This will always produce matches in ascending order of index, where the
  /// index corresponds to the index of the regex that matched with respect to
  /// its position when initially building the set.
  #[derive(Clone)]
  pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>);

  impl<'a> Iterator for SetMatchesIter<'a> {
      type Item = usize;

      /// Advances to the next regex that matched and returns its index, or
      /// `None` once every remaining flag has been visited.
      fn next(&mut self) -> Option<usize> {
          // Entries whose flag is `false` are skipped; only indexes of
          // matched regexes are yielded.
          self.0.find(|&(_, &matched)| matched).map(|(index, _)| index)
      }

      /// Returns `(0, upper)`, where `upper` is the inner iterator's upper
      /// bound (the number of flags not yet visited).
      ///
      /// The lower bound is zero because any number of the remaining flags
      /// may be `false` and therefore skipped; forwarding the inner lower
      /// bound would promise more items than this iterator may yield.
      fn size_hint(&self) -> (usize, Option<usize>) {
          let (_, upper) = self.0.size_hint();
          (0, upper)
      }
  }

  impl<'a> DoubleEndedIterator for SetMatchesIter<'a> {
      /// Walks from the back to the closest regex that matched and returns
      /// its index, or `None` once every remaining flag has been visited.
      fn next_back(&mut self) -> Option<usize> {
          self.0
              .by_ref()
              .rev()
              .find(|&(_, &matched)| matched)
              .map(|(index, _)| index)
      }
  }
  327. #[doc(hidden)]
  328. impl From<Exec> for RegexSet {
  329. fn from(exec: Exec) -> Self {
  330. RegexSet(exec)
  331. }
  332. }
  333. impl fmt::Debug for RegexSet {
  334. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  335. write!(f, "RegexSet({:?})", self.0.regex_strings())
  336. }
  337. }
  /// Views a string slice as its underlying bytes.
  // Each `define_set!` invocation names only one of the two byte-view
  // helpers, so the other one goes unused inside that generated module.
  #[allow(dead_code)]
  fn as_bytes_str(text: &str) -> &[u8] {
      str::as_bytes(text)
  }
  /// Returns the given byte slice unchanged.
  // Each `define_set!` invocation names only one of the two byte-view
  // helpers, so the other one goes unused inside that generated module.
  #[allow(dead_code)]
  fn as_bytes_bytes(text: &[u8]) -> &[u8] {
      &text[..]
  }
  340. }
  341. }
  342. }
  // Instantiates module `unicode`: a `RegexSet` whose search text is `&str`,
  // built through `re_builder::set_unicode` and viewed as bytes with
  // `as_bytes_str`. The trailing doc comment becomes the example spliced into
  // the `RegexSet` type documentation.
  343. define_set! {
  344. unicode,
  345. set_unicode,
  346. &str,
  347. as_bytes_str,
  348. /// ```rust
  349. /// # use regex::RegexSet;
  350. /// let set = RegexSet::new(&[
  351. /// r"[a-z]+@[a-z]+\.(com|org|net)",
  352. /// r"[a-z]+\.(com|org|net)",
  353. /// ]).unwrap();
  354. ///
  355. /// // Ask whether any regexes in the set match.
  356. /// assert!(set.is_match("foo@example.com"));
  357. ///
  358. /// // Identify which regexes in the set match.
  359. /// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect();
  360. /// assert_eq!(vec![0, 1], matches);
  361. ///
  362. /// // Try again, but with text that only matches one of the regexes.
  363. /// let matches: Vec<_> = set.matches("example.com").into_iter().collect();
  364. /// assert_eq!(vec![1], matches);
  365. ///
  366. /// // Try again, but with text that doesn't match any regex in the set.
  367. /// let matches: Vec<_> = set.matches("example").into_iter().collect();
  368. /// assert!(matches.is_empty());
  369. /// ```
  370. }
  // Instantiates module `bytes`: a `RegexSet` whose search text is `&[u8]`,
  // built through `re_builder::set_bytes` and passed through unchanged by
  // `as_bytes_bytes`. The trailing doc comment becomes the example spliced
  // into the `RegexSet` type documentation.
  371. define_set! {
  372. bytes,
  373. set_bytes,
  374. &[u8],
  375. as_bytes_bytes,
  376. /// ```rust
  377. /// # use regex::bytes::RegexSet;
  378. /// let set = RegexSet::new(&[
  379. /// r"[a-z]+@[a-z]+\.(com|org|net)",
  380. /// r"[a-z]+\.(com|org|net)",
  381. /// ]).unwrap();
  382. ///
  383. /// // Ask whether any regexes in the set match.
  384. /// assert!(set.is_match(b"foo@example.com"));
  385. ///
  386. /// // Identify which regexes in the set match.
  387. /// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect();
  388. /// assert_eq!(vec![0, 1], matches);
  389. ///
  390. /// // Try again, but with text that only matches one of the regexes.
  391. /// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect();
  392. /// assert_eq!(vec![1], matches);
  393. ///
  394. /// // Try again, but with text that doesn't match any regex in the set.
  395. /// let matches: Vec<_> = set.matches(b"example").into_iter().collect();
  396. /// assert!(matches.is_empty());
  397. /// ```
  398. }