PageRenderTime 76ms CodeModel.GetById 31ms RepoModel.GetById 1ms app.codeStats 0ms

/alacritty/vendor/regex/src/re_set.rs

https://bitbucket.org/draffensperger/vendored-crates
Rust | 423 lines | 158 code | 31 blank | 234 comment | 8 complexity | 5c8e663f3fe36b963bc96ab3617840f2 MD5 | raw file
Possible License(s): 0BSD, CC-BY-SA-4.0, MIT, Unlicense, Apache-2.0, BSD-3-Clause, MPL-2.0-no-copyleft-exception
  1. // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
  2. // file at the top-level directory of this distribution and at
  3. // http://rust-lang.org/COPYRIGHT.
  4. //
  5. // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
  6. // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
  7. // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
  8. // option. This file may not be copied, modified, or distributed
  9. // except according to those terms.
  10. macro_rules! define_set {
  11. ($name:ident, $builder_mod:ident, $text_ty:ty, $as_bytes:expr,
  12. $(#[$doc_regexset_example:meta])* ) => {
  13. pub mod $name {
  14. use std::fmt;
  15. use std::iter;
  16. use std::slice;
  17. use std::vec;
  18. use error::Error;
  19. use exec::Exec;
  20. use re_builder::$builder_mod::RegexSetBuilder;
  21. use re_trait::RegularExpression;
  22. /// Match multiple (possibly overlapping) regular expressions in a single scan.
  23. ///
  24. /// A regex set corresponds to the union of two or more regular expressions.
  25. /// That is, a regex set will match text where at least one of its
  26. /// constituent regular expressions matches. A regex set as it is formulated here
  27. /// provides a touch more power: it will also report *which* regular
  28. /// expressions in the set match. Indeed, this is the key difference between
  29. /// regex sets and a single `Regex` with many alternates, since only one
  30. /// alternate can match at a time.
  31. ///
  32. /// For example, consider regular expressions to match email addresses and
  33. /// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a
  34. /// regex set is constructed from those regexes, then searching the text
  35. /// `foo@example.com` will report both regexes as matching. Of course, one
  36. /// could accomplish this by compiling each regex on its own and doing two
  37. /// searches over the text. The key advantage of using a regex set is that it
  38. /// will report the matching regexes using a *single pass through the text*.
  39. /// If one has hundreds or thousands of regexes to match repeatedly (like a URL
  40. /// router for a complex web application or a user agent matcher), then a regex
  41. /// set can realize huge performance gains.
  42. ///
  43. /// # Example
  44. ///
  45. /// This shows how the above two regexes (for matching email addresses and
  46. /// domains) might work:
  47. ///
  48. $(#[$doc_regexset_example])*
  49. ///
  50. /// Note that it would be possible to adapt the above example to using `Regex`
  51. /// with an expression like:
  52. ///
  53. /// ```ignore
  54. /// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net))
  55. /// ```
  56. ///
  57. /// After a match, one could then inspect the capture groups to figure out
  58. /// which alternates matched. The problem is that it is hard to make this
  59. /// approach scale when there are many regexes since the overlap between each
  60. /// alternate isn't always obvious to reason about.
  61. ///
  62. /// # Limitations
  63. ///
  64. /// Regex sets are limited to answering the following two questions:
  65. ///
  66. /// 1. Does any regex in the set match?
  67. /// 2. If so, which regexes in the set match?
  68. ///
  69. /// As with the main `Regex` type, it is cheaper to ask (1) instead of (2)
  70. /// since the matching engines can stop after the first match is found.
  71. ///
  72. /// Other features like finding the location of successive matches or their
  73. /// sub-captures aren't supported. If you need this functionality, the
  74. /// recommended approach is to compile each regex in the set independently and
  75. /// selectively match them based on which regexes in the set matched.
  76. ///
  77. /// # Performance
  78. ///
  79. /// A `RegexSet` has the same performance characteristics as `Regex`. Namely,
  80. /// search takes `O(mn)` time, where `m` is proportional to the size of the
  81. /// regex set and `n` is proportional to the length of the search text.
  // Newtype over `Exec`: it is the only field, and the methods of `RegexSet`
  // reach the searcher and the list of regex strings through `self.0`.
  82. #[derive(Clone)]
  83. pub struct RegexSet(Exec);
  84. impl RegexSet {
  85. /// Create a new regex set with the given regular expressions.
  86. ///
  87. /// This takes an iterator of `S`, where `S` is something that can produce
  88. /// a `&str`. If any of the strings in the iterator are not valid regular
  89. /// expressions, then an error is returned.
  90. ///
  91. /// # Example
  92. ///
  93. /// Create a new regex set from an iterator of strings:
  94. ///
  95. /// ```rust
  96. /// # use regex::RegexSet;
  97. /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
  98. /// assert!(set.is_match("foo"));
  99. /// ```
  100. pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error>
  101. where S: AsRef<str>, I: IntoIterator<Item=S> {
  102. RegexSetBuilder::new(exprs).build()
  103. }
  104. /// Returns true if and only if one of the regexes in this set matches
  105. /// the text given.
  106. ///
  107. /// This method should be preferred if you only need to test whether any
  108. /// of the regexes in the set should match, but don't care about *which*
  109. /// regexes matched. This is because the underlying matching engine will
  110. /// quit immediately after seeing the first match instead of continuing to
  111. /// find all matches.
  112. ///
  113. /// Note that as with searches using `Regex`, the expression is unanchored
  114. /// by default. That is, if the regex does not start with `^` or `\A`, or
  115. /// end with `$` or `\z`, then it is permitted to match anywhere in the
  116. /// text.
  117. ///
  118. /// # Example
  119. ///
  120. /// Tests whether a set matches some text:
  121. ///
  122. /// ```rust
  123. /// # use regex::RegexSet;
  124. /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
  125. /// assert!(set.is_match("foo"));
  126. /// assert!(!set.is_match("☃"));
  127. /// ```
  128. pub fn is_match(&self, text: $text_ty) -> bool {
  129. self.is_match_at(text, 0)
  130. }
  131. /// Returns the same as is_match, but starts the search at the given
  132. /// offset.
  133. ///
  134. /// The significance of the starting point is that it takes the surrounding
  135. /// context into consideration. For example, the `\A` anchor can only
  136. /// match when `start == 0`.
  137. #[doc(hidden)]
  138. pub fn is_match_at(&self, text: $text_ty, start: usize) -> bool {
  139. self.0.searcher().is_match_at($as_bytes(text), start)
  140. }
  141. /// Returns the set of regular expressions that match in the given text.
  142. ///
  143. /// The set returned contains the index of each regular expression that
  144. /// matches in the given text. The index is in correspondence with the
  145. /// order of regular expressions given to `RegexSet`'s constructor.
  146. ///
  147. /// The set can also be used to iterate over the matched indices.
  148. ///
  149. /// Note that as with searches using `Regex`, the expression is unanchored
  150. /// by default. That is, if the regex does not start with `^` or `\A`, or
  151. /// end with `$` or `\z`, then it is permitted to match anywhere in the
  152. /// text.
  153. ///
  154. /// # Example
  155. ///
  156. /// Tests which regular expressions match the given text:
  157. ///
  158. /// ```rust
  159. /// # use regex::RegexSet;
  160. /// let set = RegexSet::new(&[
  161. /// r"\w+",
  162. /// r"\d+",
  163. /// r"\pL+",
  164. /// r"foo",
  165. /// r"bar",
  166. /// r"barfoo",
  167. /// r"foobar",
  168. /// ]).unwrap();
  169. /// let matches: Vec<_> = set.matches("foobar").into_iter().collect();
  170. /// assert_eq!(matches, vec![0, 2, 3, 4, 6]);
  171. ///
  172. /// // You can also test whether a particular regex matched:
  173. /// let matches = set.matches("foobar");
  174. /// assert!(!matches.matched(5));
  175. /// assert!(matches.matched(6));
  176. /// ```
  177. pub fn matches(&self, text: $text_ty) -> SetMatches {
  178. let mut matches = vec![false; self.0.regex_strings().len()];
  179. let any = self.read_matches_at(&mut matches, text, 0);
  180. SetMatches {
  181. matched_any: any,
  182. matches: matches,
  183. }
  184. }
  185. /// Returns the same as matches, but starts the search at the given
  186. /// offset and stores the matches into the slice given.
  187. ///
  188. /// The significance of the starting point is that it takes the surrounding
  189. /// context into consideration. For example, the `\A` anchor can only
  190. /// match when `start == 0`.
  191. ///
  192. /// `matches` must have a length that is at least the number of regexes
  193. /// in this set.
  194. ///
  195. /// This method returns true if and only if at least one member of
  196. /// `matches` is true after executing the set against `text`.
  197. #[doc(hidden)]
  198. pub fn read_matches_at(
  199. &self,
  200. matches: &mut [bool],
  201. text: $text_ty,
  202. start: usize,
  203. ) -> bool {
  204. self.0.searcher().many_matches_at(matches, $as_bytes(text), start)
  205. }
  206. /// Returns the total number of regular expressions in this set.
  207. pub fn len(&self) -> usize {
  208. self.0.regex_strings().len()
  209. }
  210. }
  211. /// A set of matches returned by a regex set.
  212. #[derive(Clone, Debug)]
  213. pub struct SetMatches {
  214. matched_any: bool,
  215. matches: Vec<bool>,
  216. }
  217. impl SetMatches {
  218. /// Whether this set contains any matches.
  219. pub fn matched_any(&self) -> bool {
  220. self.matched_any
  221. }
  222. /// Whether the regex at the given index matched.
  223. ///
  224. /// The index for a regex is determined by its insertion order upon the
  225. /// initial construction of a `RegexSet`, starting at `0`.
  226. ///
  227. /// # Panics
  228. ///
  229. /// If `regex_index` is greater than or equal to `self.len()`.
  230. pub fn matched(&self, regex_index: usize) -> bool {
  231. self.matches[regex_index]
  232. }
  233. /// The total number of regexes in the set that created these matches.
  234. pub fn len(&self) -> usize {
  235. self.matches.len()
  236. }
  237. /// Returns an iterator over indexes in the regex that matched.
  238. ///
  239. /// This will always produces matches in ascending order of index, where
  240. /// the index corresponds to the index of the regex that matched with
  241. /// respect to its position when initially building the set.
  242. pub fn iter(&self) -> SetMatchesIter {
  243. SetMatchesIter((&*self.matches).into_iter().enumerate())
  244. }
  245. }
  246. impl IntoIterator for SetMatches {
  247. type IntoIter = SetMatchesIntoIter;
  248. type Item = usize;
  249. fn into_iter(self) -> Self::IntoIter {
  250. SetMatchesIntoIter(self.matches.into_iter().enumerate())
  251. }
  252. }
  253. impl<'a> IntoIterator for &'a SetMatches {
  254. type IntoIter = SetMatchesIter<'a>;
  255. type Item = usize;
  256. fn into_iter(self) -> Self::IntoIter {
  257. self.iter()
  258. }
  259. }
  /// An owned iterator over the set of matches from a regex set.
  ///
  /// This will always produce matches in ascending order of index, where the
  /// index corresponds to the index of the regex that matched with respect to
  /// its position when initially building the set.
  #[derive(Debug)]
  pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>);

  impl Iterator for SetMatchesIntoIter {
      type Item = usize;

      /// Advances to the next regex whose flag is set and returns its index,
      /// or `None` once every remaining flag is unset.
      fn next(&mut self) -> Option<usize> {
          self.0
              .by_ref()
              .find(|&(_, matched)| matched)
              .map(|(index, _)| index)
      }

      /// The lower bound is zero because none of the remaining regexes may
      /// have matched; the upper bound is the number of flags not yet
      /// visited.
      fn size_hint(&self) -> (usize, Option<usize>) {
          let (_, upper) = self.0.size_hint();
          (0, upper)
      }
  }

  impl DoubleEndedIterator for SetMatchesIntoIter {
      /// Same as `next`, but scans from the highest remaining index downward.
      fn next_back(&mut self) -> Option<usize> {
          self.0
              .by_ref()
              .rev()
              .find(|&(_, matched)| matched)
              .map(|(index, _)| index)
      }
  }
  /// A borrowed iterator over the set of matches from a regex set.
  ///
  /// The lifetime `'a` refers to the lifetime of a `SetMatches` value.
  ///
  /// This will always produce matches in ascending order of index, where the
  /// index corresponds to the index of the regex that matched with respect to
  /// its position when initially building the set.
  #[derive(Clone, Debug)]
  pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>);

  impl<'a> Iterator for SetMatchesIter<'a> {
      type Item = usize;

      /// Advances to the next regex whose flag is set and returns its index,
      /// or `None` once every remaining flag is unset.
      fn next(&mut self) -> Option<usize> {
          self.0
              .by_ref()
              .find(|&(_, &matched)| matched)
              .map(|(index, _)| index)
      }

      /// The lower bound is zero because none of the remaining regexes may
      /// have matched; the upper bound is the number of flags not yet
      /// visited.
      fn size_hint(&self) -> (usize, Option<usize>) {
          let (_, upper) = self.0.size_hint();
          (0, upper)
      }
  }

  impl<'a> DoubleEndedIterator for SetMatchesIter<'a> {
      /// Same as `next`, but scans from the highest remaining index downward.
      fn next_back(&mut self) -> Option<usize> {
          self.0
              .by_ref()
              .rev()
              .find(|&(_, &matched)| matched)
              .map(|(index, _)| index)
      }
  }
  321. #[doc(hidden)]
  322. impl From<Exec> for RegexSet {
  323. fn from(exec: Exec) -> Self {
  324. RegexSet(exec)
  325. }
  326. }
  327. impl fmt::Debug for RegexSet {
  328. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  329. write!(f, "RegexSet({:?})", self.0.regex_strings())
  330. }
  331. }
  // Text-to-bytes adapters passed to the macro as `$as_bytes`. Each
  // instantiation of the macro names only one of them, so the other is unused
  // in that module; hence `allow(dead_code)`.

  /// Views a string slice as its underlying bytes.
  #[allow(dead_code)]
  fn as_bytes_str(text: &str) -> &[u8] {
      str::as_bytes(text)
  }

  /// Returns the byte slice unchanged.
  #[allow(dead_code)]
  fn as_bytes_bytes(text: &[u8]) -> &[u8] {
      &text[..]
  }
  334. }
  335. }
  336. }
  // Instantiates the macro as module `unicode`: the generated `RegexSet` takes
  // `&str` text (converted to bytes with `as_bytes_str`) and is built through
  // `re_builder::set_unicode::RegexSetBuilder`. The trailing `///` lines are
  // spliced into the generated `RegexSet` documentation as its example.
  337. define_set! {
  338. unicode,
  339. set_unicode,
  340. &str,
  341. as_bytes_str,
  342. /// ```rust
  343. /// # use regex::RegexSet;
  344. /// let set = RegexSet::new(&[
  345. /// r"[a-z]+@[a-z]+\.(com|org|net)",
  346. /// r"[a-z]+\.(com|org|net)",
  347. /// ]).unwrap();
  348. ///
  349. /// // Ask whether any regexes in the set match.
  350. /// assert!(set.is_match("foo@example.com"));
  351. ///
  352. /// // Identify which regexes in the set match.
  353. /// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect();
  354. /// assert_eq!(vec![0, 1], matches);
  355. ///
  356. /// // Try again, but with text that only matches one of the regexes.
  357. /// let matches: Vec<_> = set.matches("example.com").into_iter().collect();
  358. /// assert_eq!(vec![1], matches);
  359. ///
  360. /// // Try again, but with text that doesn't match any regex in the set.
  361. /// let matches: Vec<_> = set.matches("example").into_iter().collect();
  362. /// assert!(matches.is_empty());
  363. /// ```
  364. }
  // Instantiates the macro as module `bytes`: the generated `RegexSet` takes
  // `&[u8]` text (passed through unchanged by `as_bytes_bytes`) and is built
  // through `re_builder::set_bytes::RegexSetBuilder`. The trailing `///` lines
  // are spliced into the generated `RegexSet` documentation as its example.
  365. define_set! {
  366. bytes,
  367. set_bytes,
  368. &[u8],
  369. as_bytes_bytes,
  370. /// ```rust
  371. /// # use regex::bytes::RegexSet;
  372. /// let set = RegexSet::new(&[
  373. /// r"[a-z]+@[a-z]+\.(com|org|net)",
  374. /// r"[a-z]+\.(com|org|net)",
  375. /// ]).unwrap();
  376. ///
  377. /// // Ask whether any regexes in the set match.
  378. /// assert!(set.is_match(b"foo@example.com"));
  379. ///
  380. /// // Identify which regexes in the set match.
  381. /// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect();
  382. /// assert_eq!(vec![0, 1], matches);
  383. ///
  384. /// // Try again, but with text that only matches one of the regexes.
  385. /// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect();
  386. /// assert_eq!(vec![1], matches);
  387. ///
  388. /// // Try again, but with text that doesn't match any regex in the set.
  389. /// let matches: Vec<_> = set.matches(b"example").into_iter().collect();
  390. /// assert!(matches.is_empty());
  391. /// ```
  392. }