PageRenderTime 1042ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/DDG/Request.pm

https://gitlab.com/lanodan/duckduckgo
Perl | 388 lines | 329 code | 47 blank | 12 comment | 8 complexity | 0ee9b990292028daf659765321afd087 MD5 | raw file
Possible License(s): Apache-2.0
  1. package DDG::Request;
  2. # ABSTRACT: A request to DuckDuckGo itself, i.e. the query and the parameters around it that define the request
  3. use Moo;
  4. use utf8;
  5. use List::MoreUtils qw{ uniq };
  6. =head1 SYNOPSIS
  7. my $req = DDG::Request->new( query_raw => "Peter PAUL AND MARY!" );
  8. print $req->query_clean; # "peter paul and mary"
  9. =head1 DESCRIPTION
  10. This is the main request class which reflects a query and all parameters that
  11. are relevant for plugins to work with the request. It does not reflect a web
  12. request itself to DuckDuckGo, for this we have internal classes. The request
  13. class is the abstracted level all services can independently work with, on any
  14. medium, so also on the L<API|http://duckduckgo.com/api.html>, or via console
  15. based tests without web environment. This class is also the base for a run on a
  16. L<DDG::Block>.
  17. Beside the information of the query itself, a L<DDG::Request> can also contain
  18. the language, the region and the geo location (which is calculated out of the
  19. IP).
  20. =cut
  21. #
  22. # QUERY
  23. #
  24. ###############################
  25. =attr query_raw
  26. This is the only required attribute. It is the query in the most raw form. If
  27. the query is given over special ways (like coming out of a hard url like
  28. L<https://duckduckgo.com/Star_Trek_Voyager>), then those must get converted to
  29. the text that is normally shown on the query line then, before given to
  30. L</query_raw>.
  31. =cut
  # The query exactly as it was entered. It is read-only and has to be passed
  # to the constructor.
  has 'query_raw' => (
      required => 1,
      is       => 'ro',
  );
  # Pre-compiled patterns used by the builders in this file.

  # A run of one or more whitespace characters.
  my $whitespaces = qr/\s+/;

  # The same run inside a capture group, so that split() also returns the
  # separators it matched.
  my $whitespaces_matches = qr/($whitespaces)/;

  # A run of whitespace characters and/or dashes.
  my $whitespaces_dashes = qr/[\s\-]+/;

  # A run of control characters, ASCII punctuation (the space character itself
  # is not included), the code points 0x7b-0x81 and 0xa7.
  my $non_alphanumeric_ascii = qr/[\x00-\x1f\x21-\x2f\x3a-\x40\x5b-\x60\x7b-\x81\x{a7}]+/;
  40. =attr query_raw_parts
  41. This attribute gets generated out of the L</query_raw>, which gets split into
  42. all whitespace and non-whitespace content. For example the query:
  43. DDG::Request->new( query_raw => "A++ B++" );
  44. would give you the following arrayref on L</query_raw_parts>:
  45. [
  46. 'A++',
  47. ' ',
  48. 'B++',
  49. ]
  50. It preserves the exact content of the query, including the exact amount of
  51. whitespace. The even index positions of the arrayref always hold the non
  52. whitespace content. So the query:
  53. DDG::Request->new( query_raw => " A++ B++ " );
  54. leads to this L</query_raw_parts>:
  55. [
  56. '',
  57. ' ',
  58. 'A++',
  59. ' ',
  60. 'B++',
  61. ' ',
  62. ]
  63. =cut
  # Read-only attribute, computed on first access by _build_query_raw_parts.
  has 'query_raw_parts' => (
      builder => '_build_query_raw_parts',
      lazy    => 1,
      is      => 'ro',
  );
  # Builder for query_raw_parts: splits query_raw on whitespace runs and keeps
  # the separators, so the returned arrayref alternates between non-whitespace
  # content (even indexes) and whitespace (odd indexes).
  sub _build_query_raw_parts {
      my ($self) = @_;
      my @pieces = split /$whitespaces_matches/, $self->query_raw;
      return \@pieces;
  }
  74. =attr query_parts
  75. This attribute filters out the whitespace parts and empty parts of
  76. L</query_raw_parts>. It also drops every part which would bring the running
  77. total of the query to 100 or more non whitespace characters.
  78. =cut
  # Read-only attribute, computed on first access by _build_query_parts.
  has 'query_parts' => (
      builder => '_build_query_parts',
      lazy    => 1,
      is      => 'ro',
  );
  # Builder for query_parts: keeps the non-empty, non-whitespace entries of
  # query_raw_parts for as long as the running character total stays below 100.
  sub _build_query_parts {
      my ($self) = @_;
      my $seen_chars = 0;
      my @kept;
      for my $piece ( @{ $self->query_raw_parts } ) {
          next unless length $piece;
          next if $piece =~ /$whitespaces/;
          # The total grows even for a rejected piece, so nothing after the
          # first rejected piece can be accepted again.
          $seen_chars += length $piece;
          push @kept, $piece if $seen_chars < 100;
      }
      return \@kept;
  }
  93. =attr query_parts_lc
  94. This takes the arrayref of L</query_parts> and makes a lowercase arrayref
  95. version of it.
  96. =cut
  # Read-only attribute, computed on first access by _build_query_parts_lc.
  has 'query_parts_lc' => (
      builder => '_build_query_parts_lc',
      lazy    => 1,
      is      => 'ro',
  );
  # Builder for query_parts_lc: returns a new arrayref holding the lowercased
  # entries of query_parts, in the same order.
  sub _build_query_parts_lc {
      my ($self) = @_;
      my @lowered;
      push @lowered, lc $_ for @{ $self->query_parts };
      return \@lowered;
  }
  108. =attr triggers
  109. Triggers are a hashref construction which makes it easy to parse a query
  110. efficiently by accessing it word by word, so that it only has to be analyzed
  111. against as few combinations as possible.
  112. It uses L</query_raw_parts> for this, but ignores the whitespace parts. Then it
  113. passes every part through L</generate_triggers> which gives back all possible
  114. variants of the specific given part.
  115. =cut
  # Read-only attribute, computed on first access by _build_triggers.
  has 'triggers' => (
      builder => '_build_triggers',
      lazy    => 1,
      is      => 'ro',
  );
  # Builder for triggers: maps the index of every non-whitespace entry of
  # query_raw_parts to an arrayref with the trigger variants that
  # generate_triggers produces for that entry.
  sub _build_triggers {
      my ($self) = @_;
      my $raw_parts = $self->query_raw_parts;
      return {} unless @$raw_parts;

      # A leading empty entry means the query started with whitespace; the
      # first real word then sits at index 2.
      my $index = $raw_parts->[0] eq '' ? 2 : 0;

      my %triggers_at;
      while ( $index <= $#{$raw_parts} ) {
          $triggers_at{$index} = [ $self->generate_triggers( $raw_parts->[$index] ) ];
          $index += 2;    # odd indexes hold the whitespace separators
      }
      return \%triggers_at;
  }
  134. =method generate_triggers
  135. This function takes a part of L</query_raw_parts> and generates all possible
  136. variants of it, also doing some magic with dash given words to give both
  137. single or combined without dash or only with space. For a detailed analysis
  138. of which triggers are generated out of a part, please read the function.
  139. =cut
  # Produces every trigger variant for one non-whitespace part of the query.
  #
  # Parameters:
  #   $original_part - one entry of query_raw_parts (not modified).
  #
  # Returns the sorted list of unique, lowercased variants:
  #   * the part as given,
  #   * the part without a leading "!",
  #   * that part without a trailing "?",
  #   * and, if the first run of non-word characters in the remaining part
  #     contains no apostrophe: every piece between non-word runs, all pieces
  #     glued together, and all pieces joined by single spaces.
  sub generate_triggers {
      my ( $self, $original_part ) = @_;
      my $part = $original_part;

      my @parts = ( lc($part) );

      # Both substitutions are anchored and can match at most once.
      $part =~ s/^!//;
      push @parts, lc($part);
      $part =~ s/\?$//;
      push @parts, lc($part);

      if ( $part =~ m/(\W+)/ and $1 !~ m/'/ ) {
          my @boundary_words = split( /\W+/, $part );
          push @parts,
              ( map { lc($_) } @boundary_words ),
              lc( join( '',  @boundary_words ) ),
              lc( join( ' ', @boundary_words ) );
      }

      return uniq sort @parts;
  }
  161. =method generate_remainder
  162. The method takes 2 index positions of L</query_raw_parts> and gives back the
  163. parts of the query which are not between them, so it removes those parts and
  164. generates out of the rest a string again, which can be given to a plugin for
  165. example.
  166. It doesn't check which one is bigger; the first one must always be lower than
  167. the second one given. You can also just give one index position.
  168. =cut
  # Returns the query text that is left once the parts from $from_pos to
  # $to_pos (index positions of query_raw_parts, inclusive) are cut out.
  #
  # Parameters:
  #   $from_pos - index of the first part to remove.
  #   $to_pos   - index of the last part to remove; defaults to $from_pos.
  #               It is not checked that $from_pos <= $to_pos.
  #
  # The whitespace that separated the removed parts from the rest is trimmed
  # as a whole run, because one separator entry of query_raw_parts can hold
  # several whitespace characters. If text remains on both sides, the two
  # sides are joined with a single space.
  sub generate_remainder {
      my ( $self, $from_pos, $to_pos ) = @_;
      $to_pos = $from_pos unless defined $to_pos;

      my @query_raw_parts = @{ $self->query_raw_parts };
      my $max             = $#query_raw_parts;
      my $remainder       = '';

      if ( $to_pos < $max
          && ( $from_pos == 0 || ( $from_pos == 2 && $query_raw_parts[0] eq '' ) ) )
      {
          # Removed range starts at the first word: keep what follows it.
          $remainder = join( '', @query_raw_parts[ $to_pos + 1 .. $max ] );
          $remainder =~ s/\A\s+//;
      }
      elsif ( $max % 2 ? $to_pos == $max - 1 : $to_pos == $max ) {
          # Removed range ends at the last word (an odd $max means the query
          # has trailing whitespace): keep what precedes it.
          $remainder = join( '', @query_raw_parts[ 0 .. $from_pos - 1 ] );
          $remainder =~ s/\s+\z//;
      }
      else {
          # Removed range is in the middle: keep both sides.
          my $left_remainder  = join( '', @query_raw_parts[ 0 .. $from_pos - 1 ] );
          my $right_remainder = join( '', @query_raw_parts[ $to_pos + 1 .. $max ] );
          $left_remainder  =~ s/\s+\z//;
          $right_remainder =~ s/\A\s+//;
          $remainder = $left_remainder . ' ' . $right_remainder;
      }

      return $remainder;
  }
  190. =attr query
  191. Takes L</query_parts> and join them with one space.
  192. =cut
  # Read-only attribute, computed on first access by _build_query.
  has 'query' => (
      builder => '_build_query',
      lazy    => 1,
      is      => 'ro',
  );
  # Builder for query: the entries of query_parts joined by single spaces.
  sub _build_query {
      my ($self) = @_;
      my $parts = $self->query_parts;
      return join( ' ', @$parts );
  }
  201. =attr query_lc
  202. Takes L</query> and lowercases it.
  203. =cut
  # Read-only attribute, computed on first access by _build_query_lc.
  has 'query_lc' => (
      builder => '_build_query_lc',
      lazy    => 1,
      is      => 'ro',
  );
  # Builder for query_lc: the lowercased value of query.
  sub _build_query_lc {
      my ($self) = @_;
      return lc( $self->query );
  }
  212. =attr query_nowhitespace
  213. Takes L</query> and removes all whitespaces.
  214. =cut
  # Read-only attribute, computed on first access by _build_query_nowhitespace.
  has 'query_nowhitespace' => (
      builder => '_build_query_nowhitespace',
      lazy    => 1,
      is      => 'ro',
  );
  # Builder for query_nowhitespace: a copy of query with every whitespace run
  # removed.
  sub _build_query_nowhitespace {
      my ($self) = @_;
      my $squeezed = $self->query;
      $squeezed =~ s/$whitespaces//g;
      return $squeezed;
  }
  226. =attr query_nowhitespace_nodash
  227. Takes L</query> and removes all whitespaces and dashes.
  228. =cut
  # Read-only attribute, computed on first access by
  # _build_query_nowhitespace_nodash.
  has 'query_nowhitespace_nodash' => (
      builder => '_build_query_nowhitespace_nodash',
      lazy    => 1,
      is      => 'ro',
  );
  # Builder for query_nowhitespace_nodash: a copy of query with every run of
  # whitespace and/or dashes removed.
  sub _build_query_nowhitespace_nodash {
      my ($self) = @_;
      my $squeezed = $self->query;
      $squeezed =~ s/$whitespaces_dashes//g;
      return $squeezed;
  }
  240. =attr query_clean
  241. Takes L</query_lc>, removes all non alphanumeric ascii and collapses every run of whitespaces into a single space.
  242. =cut
  # Read-only attribute, computed on first access by _build_query_clean.
  has 'query_clean' => (
      builder => '_build_query_clean',
      lazy    => 1,
      is      => 'ro',
  );
  # Builder for query_clean: takes query_lc, deletes everything matched by
  # $non_alphanumeric_ascii and then turns each whitespace run into one space.
  sub _build_query_clean {
      my ($self) = @_;
      my $cleaned = $self->query_lc;
      $cleaned =~ s/$non_alphanumeric_ascii//g;
      $cleaned =~ s/$whitespaces/ /g;
      return $cleaned;
  }
  255. =attr words
  256. Takes L</query_clean> and generates an arrayref of the non-whitespace parts.
  257. =cut
  # Read-only attribute, computed on first access by _build_words.
  has 'words' => (
      builder => '_build_words',
      lazy    => 1,
      is      => 'ro',
  );
  # Builder for words: splits query_clean on whitespace runs and keeps only the
  # non-empty pieces.
  sub _build_words {
      my ($self) = @_;
      my @words;
      for my $chunk ( split /$whitespaces/, $self->query_clean ) {
          push @words, $chunk if length $chunk;
      }
      return \@words;
  }
  269. =attr wordcount
  270. Is the count of the elements in L</words>
  271. =cut
  # Read-only attribute, computed on first access by _build_wordcount.
  has 'wordcount' => (
      builder => '_build_wordcount',
      lazy    => 1,
      is      => 'ro',
  );
  # Builder for wordcount: the number of entries in words.
  sub _build_wordcount {
      my ($self) = @_;
      my $words = $self->words;
      return scalar @$words;
  }
  278. =attr seen_plugins
  279. This array contains all the plugins which already worked with this request.
  280. This means all the plugins which are triggered. If they gave back a result or
  281. not, doesn't matter here. This list is used by L<DDG::Block/allow_duplicate>.
  282. =cut
  # Read-write attribute; if nothing was set, the first access fills it via
  # _build_seen_plugins.
  has 'seen_plugins' => (
      builder => '_build_seen_plugins',
      lazy    => 1,
      is      => 'rw',
  );
  # Builder for seen_plugins: starts every request with a fresh, empty list.
  sub _build_seen_plugins {
      return [];
  }
  289. #
  290. # LANGUAGE / LOCATION / IP
  291. #
  292. ###############################
  293. # DDG::Language TODO
  # Optional read-only attribute; has_language tells whether it was set.
  has 'language' => (
      predicate => 'has_language',
      is        => 'ro',
      # isa => 'DDG::Language',    # type constraint is disabled (see TODO)
  );
  # Short alias for the language accessor.
  sub lang {
      my ($self) = @_;
      return $self->language;
  }
  # Optional read-only attribute; has_location tells whether it was set.
  has 'location' => (
      predicate => 'has_location',
      is        => 'ro',
      # isa => 'DDG::Location',    # type constraint is disabled
  );
  # Short alias for the location accessor.
  sub loc {
      my ($self) = @_;
      return $self->location;
  }
  306. 1;