PageRenderTime 839ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 1ms

/third_party/rust/regex-0.2.2/src/re_set.rs

https://bitbucket.org/vionika/spin.android
Rust | 411 lines | 158 code | 31 blank | 222 comment | 8 complexity | d5e927a3535e5ca21a64837e7cc2b9e7 MD5 | raw file
Possible License(s): JSON, 0BSD, AGPL-1.0, BSD-2-Clause, GPL-3.0, LGPL-2.1, LGPL-3.0, CC0-1.0, AGPL-3.0, MPL-2.0, Apache-2.0, MIT, BSD-3-Clause, MPL-2.0-no-copyleft-exception, GPL-2.0, Unlicense
  1. // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
  2. // file at the top-level directory of this distribution and at
  3. // http://rust-lang.org/COPYRIGHT.
  4. //
  5. // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
  6. // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
  7. // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
  8. // option. This file may not be copied, modified, or distributed
  9. // except according to those terms.
  10. macro_rules! define_set {
  11. ($name:ident, $builder_mod:ident, $text_ty:ty, $as_bytes:expr,
  12. $(#[$doc_regexset_example:meta])* ) => {
  13. pub mod $name {
  14. use std::fmt;
  15. use std::iter;
  16. use std::slice;
  17. use std::vec;
  18. use error::Error;
  19. use exec::Exec;
  20. use re_builder::$builder_mod::RegexSetBuilder;
  21. use re_trait::RegularExpression;
  22. /// Match multiple (possibly overlapping) regular expressions in a single scan.
  23. ///
  24. /// A regex set corresponds to the union of two or more regular expressions.
  25. /// That is, a regex set will match text where at least one of its
  26. /// constituent regular expressions matches. A regex set as it is formulated here
  27. /// provides a touch more power: it will also report *which* regular
  28. /// expressions in the set match. Indeed, this is the key difference between
  29. /// regex sets and a single `Regex` with many alternates, since only one
  30. /// alternate can match at a time.
  31. ///
  32. /// For example, consider regular expressions to match email addresses and
  33. /// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a
  34. /// regex set is constructed from those regexes, then searching the text
  35. /// `foo@example.com` will report both regexes as matching. Of course, one
  36. /// could accomplish this by compiling each regex on its own and doing two
  37. /// searches over the text. The key advantage of using a regex set is that it
  38. /// will report the matching regexes using a *single pass through the text*.
  39. /// If one has hundreds or thousands of regexes to match repeatedly (like a URL
  40. /// router for a complex web application or a user agent matcher), then a regex
  41. /// set can realize huge performance gains.
  42. ///
  43. /// # Example
  44. ///
  45. /// This shows how the above two regexes (for matching email addresses and
  46. /// domains) might work:
  47. ///
  48. $(#[$doc_regexset_example])* // splices in the example doc lines supplied by each `define_set!` invocation
  49. ///
  50. /// Note that it would be possible to adapt the above example to using `Regex`
  51. /// with an expression like:
  52. ///
  53. /// ```ignore
  54. /// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net))
  55. /// ```
  56. ///
  57. /// After a match, one could then inspect the capture groups to figure out
  58. /// which alternates matched. The problem is that it is hard to make this
  59. /// approach scale when there are many regexes since the overlap between each
  60. /// alternate isn't always obvious to reason about.
  61. ///
  62. /// # Limitations
  63. ///
  64. /// Regex sets are limited to answering the following two questions:
  65. ///
  66. /// 1. Does any regex in the set match?
  67. /// 2. If so, which regexes in the set match?
  68. ///
  69. /// As with the main `Regex` type, it is cheaper to ask (1) instead of (2)
  70. /// since the matching engines can stop after the first match is found.
  71. ///
  72. /// Other features like finding the location of successive matches or their
  73. /// sub-captures aren't supported. If you need this functionality, the
  74. /// recommended approach is to compile each regex in the set independently and
  75. /// selectively match them based on which regexes in the set matched.
  76. ///
  77. /// # Performance
  78. ///
  79. /// A `RegexSet` has the same performance characteristics as `Regex`. Namely,
  80. /// search takes `O(mn)` time, where `m` is proportional to the size of the
  81. /// regex set and `n` is proportional to the length of the search text.
  82. #[derive(Clone)]
  83. pub struct RegexSet(Exec);
  84. impl RegexSet {
  85. /// Create a new regex set with the given regular expressions.
  86. ///
  87. /// This takes an iterator of `S`, where `S` is something that can produce
  88. /// a `&str`. If any of the strings in the iterator are not valid regular
  89. /// expressions, then an error is returned.
  90. ///
  91. /// # Example
  92. ///
  93. /// Create a new regex set from an iterator of strings:
  94. ///
  95. /// ```rust
  96. /// # use regex::RegexSet;
  97. /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
  98. /// assert!(set.is_match("foo"));
  99. /// ```
  100. pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error>
  101. where S: AsRef<str>, I: IntoIterator<Item=S> {
  102. RegexSetBuilder::new(exprs).build()
  103. }
  104. /// Returns true if and only if one of the regexes in this set matches
  105. /// the text given.
  106. ///
  107. /// This method should be preferred if you only need to test whether any
  108. /// of the regexes in the set should match, but don't care about *which*
  109. /// regexes matched. This is because the underlying matching engine will
  110. /// quit immediately after seeing the first match instead of continuing to
  111. /// find all matches.
  112. ///
  113. /// Note that as with searches using `Regex`, the expression is unanchored
  114. /// by default. That is, if the regex does not start with `^` or `\A`, or
  115. /// end with `$` or `\z`, then it is permitted to match anywhere in the
  116. /// text.
  117. ///
  118. /// # Example
  119. ///
  120. /// Tests whether a set matches some text:
  121. ///
  122. /// ```rust
  123. /// # use regex::RegexSet;
  124. /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
  125. /// assert!(set.is_match("foo"));
  126. /// assert!(!set.is_match("☃"));
  127. /// ```
  128. pub fn is_match(&self, text: $text_ty) -> bool {
  129. self.is_match_at(text, 0)
  130. }
  131. /// Returns the same as is_match, but starts the search at the given
  132. /// offset.
  133. ///
  134. /// The significance of the starting point is that it takes the surrounding
  135. /// context into consideration. For example, the `\A` anchor can only
  136. /// match when `start == 0`.
  137. #[doc(hidden)]
  138. pub fn is_match_at(&self, text: $text_ty, start: usize) -> bool {
  139. self.0.searcher().is_match_at($as_bytes(text), start)
  140. }
  141. /// Returns the set of regular expressions that match in the given text.
  142. ///
  143. /// The set returned contains the index of each regular expression that
  144. /// matches in the given text. The index is in correspondence with the
  145. /// order of regular expressions given to `RegexSet`'s constructor.
  146. ///
  147. /// The set can also be used to iterate over the matched indices.
  148. ///
  149. /// Note that as with searches using `Regex`, the expression is unanchored
  150. /// by default. That is, if the regex does not start with `^` or `\A`, or
  151. /// end with `$` or `\z`, then it is permitted to match anywhere in the
  152. /// text.
  153. ///
  154. /// # Example
  155. ///
  156. /// Tests which regular expressions match the given text:
  157. ///
  158. /// ```rust
  159. /// # use regex::RegexSet;
  160. /// let set = RegexSet::new(&[
  161. /// r"\w+",
  162. /// r"\d+",
  163. /// r"\pL+",
  164. /// r"foo",
  165. /// r"bar",
  166. /// r"barfoo",
  167. /// r"foobar",
  168. /// ]).unwrap();
  169. /// let matches: Vec<_> = set.matches("foobar").into_iter().collect();
  170. /// assert_eq!(matches, vec![0, 2, 3, 4, 6]);
  171. ///
  172. /// // You can also test whether a particular regex matched:
  173. /// let matches = set.matches("foobar");
  174. /// assert!(!matches.matched(5));
  175. /// assert!(matches.matched(6));
  176. /// ```
  177. pub fn matches(&self, text: $text_ty) -> SetMatches {
  178. let mut matches = vec![false; self.0.regex_strings().len()];
  179. let any = self.read_matches_at(&mut matches, text, 0);
  180. SetMatches {
  181. matched_any: any,
  182. matches: matches,
  183. }
  184. }
  185. /// Returns the same as matches, but starts the search at the given
  186. /// offset and stores the matches into the slice given.
  187. ///
  188. /// The significance of the starting point is that it takes the surrounding
  189. /// context into consideration. For example, the `\A` anchor can only
  190. /// match when `start == 0`.
  191. ///
  192. /// `matches` must have a length that is at least the number of regexes
  193. /// in this set.
  194. ///
  195. /// This method returns true if and only if at least one member of
  196. /// `matches` is true after executing the set against `text`.
  197. #[doc(hidden)]
  198. pub fn read_matches_at(
  199. &self,
  200. matches: &mut [bool],
  201. text: $text_ty,
  202. start: usize,
  203. ) -> bool {
  204. self.0.searcher().many_matches_at(matches, $as_bytes(text), start)
  205. }
  206. /// Returns the total number of regular expressions in this set.
  207. pub fn len(&self) -> usize {
  208. self.0.regex_strings().len()
  209. }
  210. }
  211. /// A set of matches returned by a regex set.
  212. #[derive(Clone, Debug)]
  213. pub struct SetMatches {
  214. matched_any: bool,
  215. matches: Vec<bool>,
  216. }
  217. impl SetMatches {
  218. /// Whether this set contains any matches.
  219. pub fn matched_any(&self) -> bool {
  220. self.matched_any
  221. }
  222. /// Whether the regex at the given index matched.
  223. ///
  224. /// The index for a regex is determined by its insertion order upon the
  225. /// initial construction of a `RegexSet`, starting at `0`.
  226. ///
  227. /// # Panics
  228. ///
  229. /// If `regex_index` is greater than or equal to `self.len()`.
  230. pub fn matched(&self, regex_index: usize) -> bool {
  231. self.matches[regex_index]
  232. }
  233. /// The total number of regexes in the set that created these matches.
  234. pub fn len(&self) -> usize {
  235. self.matches.len()
  236. }
  237. /// Returns an iterator over indexes in the regex that matched.
  238. pub fn iter(&self) -> SetMatchesIter {
  239. SetMatchesIter((&*self.matches).into_iter().enumerate())
  240. }
  241. }
  242. impl IntoIterator for SetMatches {
  243. type IntoIter = SetMatchesIntoIter;
  244. type Item = usize;
  245. fn into_iter(self) -> Self::IntoIter {
  246. SetMatchesIntoIter(self.matches.into_iter().enumerate())
  247. }
  248. }
  249. impl<'a> IntoIterator for &'a SetMatches {
  250. type IntoIter = SetMatchesIter<'a>;
  251. type Item = usize;
  252. fn into_iter(self) -> Self::IntoIter {
  253. self.iter()
  254. }
  255. }
  256. /// An owned iterator over the set of matches from a regex set.
  257. pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>);
  258. impl Iterator for SetMatchesIntoIter {
  259. type Item = usize;
  260. fn next(&mut self) -> Option<usize> {
  261. loop {
  262. match self.0.next() {
  263. None => return None,
  264. Some((_, false)) => {}
  265. Some((i, true)) => return Some(i),
  266. }
  267. }
  268. }
  269. }
  270. impl DoubleEndedIterator for SetMatchesIntoIter {
  271. fn next_back(&mut self) -> Option<usize> {
  272. loop {
  273. match self.0.next_back() {
  274. None => return None,
  275. Some((_, false)) => {}
  276. Some((i, true)) => return Some(i),
  277. }
  278. }
  279. }
  280. }
  281. /// A borrowed iterator over the set of matches from a regex set.
  282. ///
  283. /// The lifetime `'a` refers to the lifetime of a `SetMatches` value.
  284. #[derive(Clone)]
  285. pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>);
  286. impl<'a> Iterator for SetMatchesIter<'a> {
  287. type Item = usize;
  288. fn next(&mut self) -> Option<usize> {
  289. loop {
  290. match self.0.next() {
  291. None => return None,
  292. Some((_, &false)) => {}
  293. Some((i, &true)) => return Some(i),
  294. }
  295. }
  296. }
  297. }
  298. impl<'a> DoubleEndedIterator for SetMatchesIter<'a> {
  299. fn next_back(&mut self) -> Option<usize> {
  300. loop {
  301. match self.0.next_back() {
  302. None => return None,
  303. Some((_, &false)) => {}
  304. Some((i, &true)) => return Some(i),
  305. }
  306. }
  307. }
  308. }
  309. #[doc(hidden)]
  310. impl From<Exec> for RegexSet {
  311. fn from(exec: Exec) -> Self {
  312. RegexSet(exec)
  313. }
  314. }
  315. impl fmt::Debug for RegexSet {
  316. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  317. write!(f, "RegexSet({:?})", self.0.regex_strings())
  318. }
  319. }
  320. #[allow(dead_code)] fn as_bytes_str(text: &str) -> &[u8] { text.as_bytes() }
  321. #[allow(dead_code)] fn as_bytes_bytes(text: &[u8]) -> &[u8] { text }
  322. }
  323. }
  324. }
  325. define_set! { // Instantiates the set types for `&str` search text.
  326. unicode, // `$name`: the items are generated inside `pub mod unicode`
  327. set_unicode, // `$builder_mod`: selects `re_builder::set_unicode::RegexSetBuilder`
  328. &str, // `$text_ty`: type of the `text` parameter on the search methods
  329. as_bytes_str, // `$as_bytes`: helper applied to `text` before it reaches the searcher
  330. /// ```rust
  331. /// # use regex::RegexSet;
  332. /// let set = RegexSet::new(&[
  333. /// r"[a-z]+@[a-z]+\.(com|org|net)",
  334. /// r"[a-z]+\.(com|org|net)",
  335. /// ]).unwrap();
  336. ///
  337. /// // Ask whether any regexes in the set match.
  338. /// assert!(set.is_match("foo@example.com"));
  339. ///
  340. /// // Identify which regexes in the set match.
  341. /// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect();
  342. /// assert_eq!(vec![0, 1], matches);
  343. ///
  344. /// // Try again, but with text that only matches one of the regexes.
  345. /// let matches: Vec<_> = set.matches("example.com").into_iter().collect();
  346. /// assert_eq!(vec![1], matches);
  347. ///
  348. /// // Try again, but with text that doesn't match any regex in the set.
  349. /// let matches: Vec<_> = set.matches("example").into_iter().collect();
  350. /// assert!(matches.is_empty());
  351. /// ```
  352. }
  353. define_set! { // Instantiates the set types for `&[u8]` search text.
  354. bytes, // `$name`: the items are generated inside `pub mod bytes`
  355. set_bytes, // `$builder_mod`: selects `re_builder::set_bytes::RegexSetBuilder`
  356. &[u8], // `$text_ty`: type of the `text` parameter on the search methods
  357. as_bytes_bytes, // `$as_bytes`: helper applied to `text` before it reaches the searcher
  358. /// ```rust
  359. /// # use regex::bytes::RegexSet;
  360. /// let set = RegexSet::new(&[
  361. /// r"[a-z]+@[a-z]+\.(com|org|net)",
  362. /// r"[a-z]+\.(com|org|net)",
  363. /// ]).unwrap();
  364. ///
  365. /// // Ask whether any regexes in the set match.
  366. /// assert!(set.is_match(b"foo@example.com"));
  367. ///
  368. /// // Identify which regexes in the set match.
  369. /// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect();
  370. /// assert_eq!(vec![0, 1], matches);
  371. ///
  372. /// // Try again, but with text that only matches one of the regexes.
  373. /// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect();
  374. /// assert_eq!(vec![1], matches);
  375. ///
  376. /// // Try again, but with text that doesn't match any regex in the set.
  377. /// let matches: Vec<_> = set.matches(b"example").into_iter().collect();
  378. /// assert!(matches.is_empty());
  379. /// ```
  380. }